Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 28 Jan 2022 17:00:26 +0000 (19:00 +0200)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 28 Jan 2022 17:00:26 +0000 (19:00 +0200)
Pull kvm fixes from Paolo Bonzini:
 "Two larger x86 series:

   - Redo incorrect fix for SEV/SMAP erratum

   - Windows 11 Hyper-V workaround

  Other x86 changes:

   - Various x86 cleanups

   - Re-enable access_tracking_perf_test

   - Fix for #GP handling on SVM

   - Fix for CPUID leaf 0Dh in KVM_GET_SUPPORTED_CPUID

   - Fix for ICEBP in interrupt shadow

   - Avoid false-positive RCU splat

   - Enable Enlightened MSR-Bitmap support for real

  ARM:

   - Correctly update the shadow register on exception injection when
     running in nVHE mode

   - Correctly use the mm_ops indirection when performing cache
     invalidation from the page-table walker

   - Restrict the vgic-v3 workaround for SEIS to the two known broken
     implementations

  Generic code changes:

   - Dead code cleanup"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (43 commits)
  KVM: eventfd: Fix false positive RCU usage warning
  KVM: nVMX: Allow VMREAD when Enlightened VMCS is in use
  KVM: nVMX: Implement evmcs_field_offset() suitable for handle_vmread()
  KVM: nVMX: Rename vmcs_to_field_offset{,_table}
  KVM: nVMX: eVMCS: Filter out VM_EXIT_SAVE_VMX_PREEMPTION_TIMER
  KVM: nVMX: Also filter MSR_IA32_VMX_TRUE_PINBASED_CTLS when eVMCS
  selftests: kvm: check dynamic bits against KVM_X86_XCOMP_GUEST_SUPP
  KVM: x86: add system attribute to retrieve full set of supported xsave states
  KVM: x86: Add a helper to retrieve userspace address from kvm_device_attr
  selftests: kvm: move vm_xsave_req_perm call to amx_test
  KVM: x86: Sync the states size with the XCR0/IA32_XSS at, any time
  KVM: x86: Update vCPU's runtime CPUID on write to MSR_IA32_XSS
  KVM: x86: Keep MSR_IA32_XSS unchanged for INIT
  KVM: x86: Free kvm_cpuid_entry2 array on post-KVM_RUN KVM_SET_CPUID{,2}
  KVM: nVMX: WARN on any attempt to allocate shadow VMCS for vmcs02
  KVM: selftests: Don't skip L2's VMCALL in SMM test for SVM guest
  KVM: x86: Check .flags in kvm_cpuid_check_equal() too
  KVM: x86: Forcibly leave nested virt when SMM state is toggled
  KVM: SVM: drop unnecessary code in svm_hv_vmcb_dirty_nested_enlightenments()
  KVM: SVM: hyper-v: Enable Enlightened MSR-Bitmap support for real
  ...

35 files changed:
Documentation/virt/kvm/api.rst
arch/arm64/kvm/hyp/exception.c
arch/arm64/kvm/hyp/pgtable.c
arch/arm64/kvm/hyp/vgic-v3-sr.c
arch/arm64/kvm/vgic/vgic-v3.c
arch/x86/include/asm/kvm_host.h
arch/x86/include/uapi/asm/kvm.h
arch/x86/kvm/cpuid.c
arch/x86/kvm/lapic.c
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/sev.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/svm/svm_onhyperv.h
arch/x86/kvm/vmx/capabilities.h
arch/x86/kvm/vmx/evmcs.c
arch/x86/kvm/vmx/evmcs.h
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/vmcs12.c
arch/x86/kvm/vmx/vmcs12.h
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/xen.c
include/uapi/linux/kvm.h
tools/arch/x86/include/uapi/asm/kvm.h
tools/include/uapi/linux/kvm.h
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/include/kvm_util_base.h
tools/testing/selftests/kvm/include/x86_64/processor.h
tools/testing/selftests/kvm/lib/kvm_util.c
tools/testing/selftests/kvm/lib/x86_64/processor.c
tools/testing/selftests/kvm/x86_64/amx_test.c
tools/testing/selftests/kvm/x86_64/smm_test.c
virt/kvm/eventfd.c
virt/kvm/kvm_main.c

index bb8cfdd..a426710 100644 (file)
@@ -3268,6 +3268,7 @@ number.
 
 :Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device,
              KVM_CAP_VCPU_ATTRIBUTES for vcpu device
+             KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device (no set)
 :Type: device ioctl, vm ioctl, vcpu ioctl
 :Parameters: struct kvm_device_attr
 :Returns: 0 on success, -1 on error
@@ -3302,7 +3303,8 @@ transferred is defined by the particular attribute.
 ------------------------
 
 :Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device,
-            KVM_CAP_VCPU_ATTRIBUTES for vcpu device
+             KVM_CAP_VCPU_ATTRIBUTES for vcpu device
+             KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device
 :Type: device ioctl, vm ioctl, vcpu ioctl
 :Parameters: struct kvm_device_attr
 :Returns: 0 on success, -1 on error
index 0418399..c5d0097 100644 (file)
@@ -38,7 +38,10 @@ static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
 
 static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, u64 val)
 {
-       write_sysreg_el1(val, SYS_SPSR);
+       if (has_vhe())
+               write_sysreg_el1(val, SYS_SPSR);
+       else
+               __vcpu_sys_reg(vcpu, SPSR_EL1) = val;
 }
 
 static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val)
index 844a6f0..2cb3867 100644 (file)
@@ -983,13 +983,9 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
         */
        stage2_put_pte(ptep, mmu, addr, level, mm_ops);
 
-       if (need_flush) {
-               kvm_pte_t *pte_follow = kvm_pte_follow(pte, mm_ops);
-
-               dcache_clean_inval_poc((unsigned long)pte_follow,
-                                   (unsigned long)pte_follow +
-                                           kvm_granule_size(level));
-       }
+       if (need_flush && mm_ops->dcache_clean_inval_poc)
+               mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops),
+                                              kvm_granule_size(level));
 
        if (childp)
                mm_ops->put_page(childp);
@@ -1151,15 +1147,13 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
        struct kvm_pgtable *pgt = arg;
        struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
        kvm_pte_t pte = *ptep;
-       kvm_pte_t *pte_follow;
 
        if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte))
                return 0;
 
-       pte_follow = kvm_pte_follow(pte, mm_ops);
-       dcache_clean_inval_poc((unsigned long)pte_follow,
-                           (unsigned long)pte_follow +
-                                   kvm_granule_size(level));
+       if (mm_ops->dcache_clean_inval_poc)
+               mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops),
+                                              kvm_granule_size(level));
        return 0;
 }
 
index 20db2f2..4fb419f 100644 (file)
@@ -983,6 +983,9 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
        val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT;
        /* IDbits */
        val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT;
+       /* SEIS */
+       if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK)
+               val |= BIT(ICC_CTLR_EL1_SEIS_SHIFT);
        /* A3V */
        val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT;
        /* EOImode */
index a33d436..b549af8 100644 (file)
@@ -609,6 +609,18 @@ static int __init early_gicv4_enable(char *buf)
 }
 early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
 
+static const struct midr_range broken_seis[] = {
+       MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM),
+       MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM),
+       {},
+};
+
+static bool vgic_v3_broken_seis(void)
+{
+       return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) &&
+               is_midr_in_range_list(read_cpuid_id(), broken_seis));
+}
+
 /**
  * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller
  * @info:      pointer to the GIC description
@@ -676,9 +688,10 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
                group1_trap = true;
        }
 
-       if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) {
-               kvm_info("GICv3 with locally generated SEI\n");
+       if (vgic_v3_broken_seis()) {
+               kvm_info("GICv3 with broken locally generated SEI\n");
 
+               kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_SEIS_MASK;
                group0_trap = true;
                group1_trap = true;
                if (ich_vtr_el2 & ICH_VTR_TDS_MASK)
index 1384517..6e7c545 100644 (file)
@@ -1483,7 +1483,8 @@ struct kvm_x86_ops {
 
        int (*get_msr_feature)(struct kvm_msr_entry *entry);
 
-       bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len);
+       bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
+                                       void *insn, int insn_len);
 
        bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
        int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
@@ -1496,6 +1497,7 @@ struct kvm_x86_ops {
 };
 
 struct kvm_x86_nested_ops {
+       void (*leave_nested)(struct kvm_vcpu *vcpu);
        int (*check_events)(struct kvm_vcpu *vcpu);
        bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
        void (*triple_fault)(struct kvm_vcpu *vcpu);
@@ -1861,7 +1863,6 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
-void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
 
 int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
                    unsigned long ipi_bitmap_high, u32 min,
index 2da3316..bf6e960 100644 (file)
@@ -452,6 +452,9 @@ struct kvm_sync_regs {
 
 #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE        0x00000001
 
+/* attributes for system fd (group 0) */
+#define KVM_X86_XCOMP_GUEST_SUPP       0
+
 struct kvm_vmx_nested_state_data {
        __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
        __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
index 3902c28..28be02a 100644 (file)
@@ -133,6 +133,7 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2
                orig = &vcpu->arch.cpuid_entries[i];
                if (e2[i].function != orig->function ||
                    e2[i].index != orig->index ||
+                   e2[i].flags != orig->flags ||
                    e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
                    e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
                        return -EINVAL;
@@ -196,10 +197,26 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
                vcpu->arch.pv_cpuid.features = best->eax;
 }
 
+/*
+ * Calculate guest's supported XCR0 taking into account guest CPUID data and
+ * supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0).
+ */
+static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent)
+{
+       struct kvm_cpuid_entry2 *best;
+
+       best = cpuid_entry2_find(entries, nent, 0xd, 0);
+       if (!best)
+               return 0;
+
+       return (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
+}
+
 static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
                                       int nent)
 {
        struct kvm_cpuid_entry2 *best;
+       u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent);
 
        best = cpuid_entry2_find(entries, nent, 1, 0);
        if (best) {
@@ -238,6 +255,21 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
                                           vcpu->arch.ia32_misc_enable_msr &
                                           MSR_IA32_MISC_ENABLE_MWAIT);
        }
+
+       /*
+        * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
+        * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
+        * requested XCR0 value.  The enclave's XFRM must be a subset of XCR0
+        * at the time of EENTER, thus adjust the allowed XFRM by the guest's
+        * supported XCR0.  Similar to XCR0 handling, FP and SSE are forced to
+        * '1' even on CPUs that don't support XSAVE.
+        */
+       best = cpuid_entry2_find(entries, nent, 0x12, 0x1);
+       if (best) {
+               best->ecx &= guest_supported_xcr0 & 0xffffffff;
+               best->edx &= guest_supported_xcr0 >> 32;
+               best->ecx |= XFEATURE_MASK_FPSSE;
+       }
 }
 
 void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
@@ -261,27 +293,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
                kvm_apic_set_version(vcpu);
        }
 
-       best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
-       if (!best)
-               vcpu->arch.guest_supported_xcr0 = 0;
-       else
-               vcpu->arch.guest_supported_xcr0 =
-                       (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
-
-       /*
-        * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
-        * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
-        * requested XCR0 value.  The enclave's XFRM must be a subset of XCRO
-        * at the time of EENTER, thus adjust the allowed XFRM by the guest's
-        * supported XCR0.  Similar to XCR0 handling, FP and SSE are forced to
-        * '1' even on CPUs that don't support XSAVE.
-        */
-       best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1);
-       if (best) {
-               best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff;
-               best->edx &= vcpu->arch.guest_supported_xcr0 >> 32;
-               best->ecx |= XFEATURE_MASK_FPSSE;
-       }
+       vcpu->arch.guest_supported_xcr0 =
+               cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
 
        kvm_update_pv_runtime(vcpu);
 
@@ -346,8 +359,14 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
         * KVM_SET_CPUID{,2} again. To support this legacy behavior, check
         * whether the supplied CPUID data is equal to what's already set.
         */
-       if (vcpu->arch.last_vmentry_cpu != -1)
-               return kvm_cpuid_check_equal(vcpu, e2, nent);
+       if (vcpu->arch.last_vmentry_cpu != -1) {
+               r = kvm_cpuid_check_equal(vcpu, e2, nent);
+               if (r)
+                       return r;
+
+               kvfree(e2);
+               return 0;
+       }
 
        r = kvm_check_cpuid(vcpu, e2, nent);
        if (r)
@@ -887,13 +906,14 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                }
                break;
        case 0xd: {
-               u64 guest_perm = xstate_get_guest_group_perm();
+               u64 permitted_xcr0 = supported_xcr0 & xstate_get_guest_group_perm();
+               u64 permitted_xss = supported_xss;
 
-               entry->eax &= supported_xcr0 & guest_perm;
-               entry->ebx = xstate_required_size(supported_xcr0, false);
+               entry->eax &= permitted_xcr0;
+               entry->ebx = xstate_required_size(permitted_xcr0, false);
                entry->ecx = entry->ebx;
-               entry->edx &= (supported_xcr0 & guest_perm) >> 32;
-               if (!supported_xcr0)
+               entry->edx &= permitted_xcr0 >> 32;
+               if (!permitted_xcr0)
                        break;
 
                entry = do_host_cpuid(array, function, 1);
@@ -902,20 +922,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 
                cpuid_entry_override(entry, CPUID_D_1_EAX);
                if (entry->eax & (F(XSAVES)|F(XSAVEC)))
-                       entry->ebx = xstate_required_size(supported_xcr0 | supported_xss,
+                       entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss,
                                                          true);
                else {
-                       WARN_ON_ONCE(supported_xss != 0);
+                       WARN_ON_ONCE(permitted_xss != 0);
                        entry->ebx = 0;
                }
-               entry->ecx &= supported_xss;
-               entry->edx &= supported_xss >> 32;
+               entry->ecx &= permitted_xss;
+               entry->edx &= permitted_xss >> 32;
 
                for (i = 2; i < 64; ++i) {
                        bool s_state;
-                       if (supported_xcr0 & BIT_ULL(i))
+                       if (permitted_xcr0 & BIT_ULL(i))
                                s_state = false;
-                       else if (supported_xss & BIT_ULL(i))
+                       else if (permitted_xss & BIT_ULL(i))
                                s_state = true;
                        else
                                continue;
@@ -929,7 +949,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                         * invalid sub-leafs.  Only valid sub-leafs should
                         * reach this point, and they should have a non-zero
                         * save state size.  Furthermore, check whether the
-                        * processor agrees with supported_xcr0/supported_xss
+                        * processor agrees with permitted_xcr0/permitted_xss
                         * on whether this is an XCR0- or IA32_XSS-managed area.
                         */
                        if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) {
index baca9fa..4662469 100644 (file)
@@ -2629,7 +2629,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
        kvm_apic_set_version(vcpu);
 
        apic_update_ppr(apic);
-       hrtimer_cancel(&apic->lapic_timer.timer);
+       cancel_apic_timer(apic);
        apic->lapic_timer.expired_tscdeadline = 0;
        apic_update_lvtt(apic);
        apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
index cf20685..1218b5a 100644 (file)
@@ -983,9 +983,9 @@ void svm_free_nested(struct vcpu_svm *svm)
 /*
  * Forcibly leave nested mode in order to be able to reset the VCPU later on.
  */
-void svm_leave_nested(struct vcpu_svm *svm)
+void svm_leave_nested(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
 
        if (is_guest_mode(vcpu)) {
                svm->nested.nested_run_pending = 0;
@@ -1411,7 +1411,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
                return -EINVAL;
 
        if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
-               svm_leave_nested(svm);
+               svm_leave_nested(vcpu);
                svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
                return 0;
        }
@@ -1478,7 +1478,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
         */
 
        if (is_guest_mode(vcpu))
-               svm_leave_nested(svm);
+               svm_leave_nested(vcpu);
        else
                svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
 
@@ -1532,6 +1532,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
 }
 
 struct kvm_x86_nested_ops svm_nested_ops = {
+       .leave_nested = svm_leave_nested,
        .check_events = svm_check_nested_events,
        .triple_fault = nested_svm_triple_fault,
        .get_nested_state_pages = svm_get_nested_state_pages,
index 6a22798..17b5345 100644 (file)
@@ -2100,8 +2100,13 @@ void __init sev_hardware_setup(void)
        if (!sev_enabled || !npt_enabled)
                goto out;
 
-       /* Does the CPU support SEV? */
-       if (!boot_cpu_has(X86_FEATURE_SEV))
+       /*
+        * SEV must obviously be supported in hardware.  Sanity check that the
+        * CPU supports decode assists, which is mandatory for SEV guests to
+        * support instruction emulation.
+        */
+       if (!boot_cpu_has(X86_FEATURE_SEV) ||
+           WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)))
                goto out;
 
        /* Retrieve SEV CPUID information */
index 2c99b18..6d97629 100644 (file)
@@ -290,7 +290,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
        if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
                if (!(efer & EFER_SVME)) {
-                       svm_leave_nested(svm);
+                       svm_leave_nested(vcpu);
                        svm_set_gif(svm, true);
                        /* #GP intercept is still needed for vmware backdoor */
                        if (!enable_vmware_backdoor)
@@ -312,7 +312,11 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
                                return ret;
                        }
 
-                       if (svm_gp_erratum_intercept)
+                       /*
+                        * Never intercept #GP for SEV guests, KVM can't
+                        * decrypt guest memory to workaround the erratum.
+                        */
+                       if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
                                set_exception_intercept(svm, GP_VECTOR);
                }
        }
@@ -1010,9 +1014,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
         * Guest access to VMware backdoor ports could legitimately
         * trigger #GP because of TSS I/O permission bitmap.
         * We intercept those #GP and allow access to them anyway
-        * as VMware does.
+        * as VMware does.  Don't intercept #GP for SEV guests as KVM can't
+        * decrypt guest memory to decode the faulting instruction.
         */
-       if (enable_vmware_backdoor)
+       if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
                set_exception_intercept(svm, GP_VECTOR);
 
        svm_set_intercept(svm, INTERCEPT_INTR);
@@ -2091,10 +2096,6 @@ static int gp_interception(struct kvm_vcpu *vcpu)
        if (error_code)
                goto reinject;
 
-       /* All SVM instructions expect page aligned RAX */
-       if (svm->vmcb->save.rax & ~PAGE_MASK)
-               goto reinject;
-
        /* Decode the instruction for usage later */
        if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
                goto reinject;
@@ -2112,8 +2113,13 @@ static int gp_interception(struct kvm_vcpu *vcpu)
                if (!is_guest_mode(vcpu))
                        return kvm_emulate_instruction(vcpu,
                                EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
-       } else
+       } else {
+               /* All SVM instructions expect page aligned RAX */
+               if (svm->vmcb->save.rax & ~PAGE_MASK)
+                       goto reinject;
+
                return emulate_svm_instr(vcpu, opcode);
+       }
 
 reinject:
        kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
@@ -4252,79 +4258,140 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
        }
 }
 
-static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+                                       void *insn, int insn_len)
 {
        bool smep, smap, is_user;
        unsigned long cr4;
+       u64 error_code;
+
+       /* Emulation is always possible when KVM has access to all guest state. */
+       if (!sev_guest(vcpu->kvm))
+               return true;
+
+       /* #UD and #GP should never be intercepted for SEV guests. */
+       WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
+                                 EMULTYPE_TRAP_UD_FORCED |
+                                 EMULTYPE_VMWARE_GP));
 
        /*
-        * When the guest is an SEV-ES guest, emulation is not possible.
+        * Emulation is impossible for SEV-ES guests as KVM doesn't have access
+        * to guest register state.
         */
        if (sev_es_guest(vcpu->kvm))
                return false;
 
+       /*
+        * Emulation is possible if the instruction is already decoded, e.g.
+        * when completing I/O after returning from userspace.
+        */
+       if (emul_type & EMULTYPE_NO_DECODE)
+               return true;
+
+       /*
+        * Emulation is possible for SEV guests if and only if a prefilled
+        * buffer containing the bytes of the intercepted instruction is
+        * available. SEV guest memory is encrypted with a guest specific key
+        * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
+        * decode garbage.
+        *
+        * Inject #UD if KVM reached this point without an instruction buffer.
+        * In practice, this path should never be hit by a well-behaved guest,
+        * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
+        * is still theoretically reachable, e.g. via unaccelerated fault-like
+        * AVIC access, and needs to be handled by KVM to avoid putting the
+        * guest into an infinite loop.  Injecting #UD is somewhat arbitrary,
+        * but it's the least awful option given lack of insight into the guest.
+        */
+       if (unlikely(!insn)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return false;
+       }
+
+       /*
+        * Emulate for SEV guests if the insn buffer is not empty.  The buffer
+        * will be empty if the DecodeAssist microcode cannot fetch bytes for
+        * the faulting instruction because the code fetch itself faulted, e.g.
+        * the guest attempted to fetch from emulated MMIO or a guest page
+        * table used to translate CS:RIP resides in emulated MMIO.
+        */
+       if (likely(insn_len))
+               return true;
+
        /*
         * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
         *
         * Errata:
-        * When CPU raise #NPF on guest data access and vCPU CR4.SMAP=1, it is
-        * possible that CPU microcode implementing DecodeAssist will fail
-        * to read bytes of instruction which caused #NPF. In this case,
-        * GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly
-        * return 0 instead of the correct guest instruction bytes.
-        *
-        * This happens because CPU microcode reading instruction bytes
-        * uses a special opcode which attempts to read data using CPL=0
-        * privileges. The microcode reads CS:RIP and if it hits a SMAP
-        * fault, it gives up and returns no instruction bytes.
+        * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
+        * possible that CPU microcode implementing DecodeAssist will fail to
+        * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
+        * be '0'.  This happens because microcode reads CS:RIP using a _data_
+        * load uop with CPL=0 privileges.  If the load hits a SMAP #PF, ucode
+        * gives up and does not fill the instruction bytes buffer.
         *
-        * Detection:
-        * We reach here in case CPU supports DecodeAssist, raised #NPF and
-        * returned 0 in GuestIntrBytes field of the VMCB.
-        * First, errata can only be triggered in case vCPU CR4.SMAP=1.
-        * Second, if vCPU CR4.SMEP=1, errata could only be triggered
-        * in case vCPU CPL==3 (Because otherwise guest would have triggered
-        * a SMEP fault instead of #NPF).
-        * Otherwise, vCPU CR4.SMEP=0, errata could be triggered by any vCPU CPL.
-        * As most guests enable SMAP if they have also enabled SMEP, use above
-        * logic in order to attempt minimize false-positive of detecting errata
-        * while still preserving all cases semantic correctness.
+        * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
+        * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
+        * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
+        * GuestIntrBytes field of the VMCB.
         *
-        * Workaround:
-        * To determine what instruction the guest was executing, the hypervisor
-        * will have to decode the instruction at the instruction pointer.
+        * This does _not_ mean that the erratum has been encountered, as the
+        * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
+        * #PF, e.g. if the guest attempted to execute from emulated MMIO and
+        * encountered a reserved/not-present #PF.
         *
-        * In non SEV guest, hypervisor will be able to read the guest
-        * memory to decode the instruction pointer when insn_len is zero
-        * so we return true to indicate that decoding is possible.
+        * To hit the erratum, the following conditions must be true:
+        *    1. CR4.SMAP=1 (obviously).
+        *    2. CR4.SMEP=0 || CPL=3.  If SMEP=1 and CPL<3, the erratum cannot
+        *       have been hit as the guest would have encountered a SMEP
+        *       violation #PF, not a #NPF.
+        *    3. The #NPF is not due to a code fetch, in which case failure to
+        *       retrieve the instruction bytes is legitimate (see above).
         *
-        * But in the SEV guest, the guest memory is encrypted with the
-        * guest specific key and hypervisor will not be able to decode the
-        * instruction pointer so we will not able to workaround it. Lets
-        * print the error and request to kill the guest.
+        * In addition, don't apply the erratum workaround if the #NPF occurred
+        * while translating guest page tables (see below).
         */
-       if (likely(!insn || insn_len))
-               return true;
-
-       /*
-        * If RIP is invalid, go ahead with emulation which will cause an
-        * internal error exit.
-        */
-       if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
-               return true;
+       error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
+       if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
+               goto resume_guest;
 
        cr4 = kvm_read_cr4(vcpu);
        smep = cr4 & X86_CR4_SMEP;
        smap = cr4 & X86_CR4_SMAP;
        is_user = svm_get_cpl(vcpu) == 3;
        if (smap && (!smep || is_user)) {
-               if (!sev_guest(vcpu->kvm))
-                       return true;
-
                pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
-               kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+
+               /*
+                * If the fault occurred in userspace, arbitrarily inject #GP
+                * to avoid killing the guest and to hopefully avoid confusing
+                * the guest kernel too much, e.g. injecting #PF would not be
+                * coherent with respect to the guest's page tables.  Request
+                * triple fault if the fault occurred in the kernel as there's
+                * no fault that KVM can inject without confusing the guest.
+                * In practice, the triple fault is moot as no sane SEV kernel
+                * will execute from user memory while also running with SMAP=1.
+                */
+               if (is_user)
+                       kvm_inject_gp(vcpu, 0);
+               else
+                       kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
        }
 
+resume_guest:
+       /*
+        * If the erratum was not hit, simply resume the guest and let it fault
+        * again.  While awful, e.g. the vCPU may get stuck in an infinite loop
+        * if the fault is at CPL=0, it's the lesser of all evils.  Exiting to
+        * userspace will kill the guest, and letting the emulator read garbage
+        * will yield random behavior and potentially corrupt the guest.
+        *
+        * Simply resuming the guest is technically not a violation of the SEV
+        * architecture.  AMD's APM states that all code fetches and page table
+        * accesses for SEV guest are encrypted, regardless of the C-Bit.  The
+        * APM also states that encrypted accesses to MMIO are "ignored", but
+        * doesn't explicitly define "ignored", i.e. doing nothing and letting
+        * the guest spin is technically "ignoring" the access.
+        */
        return false;
 }
 
index 47ef8f4..7352535 100644 (file)
@@ -304,11 +304,6 @@ static inline void vmcb_mark_all_clean(struct vmcb *vmcb)
                               & ~VMCB_ALWAYS_DIRTY_MASK;
 }
 
-static inline bool vmcb_is_clean(struct vmcb *vmcb, int bit)
-{
-       return (vmcb->control.clean & (1 << bit));
-}
-
 static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
 {
        vmcb->control.clean &= ~(1 << bit);
@@ -525,7 +520,7 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
 
 int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
                         u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun);
-void svm_leave_nested(struct vcpu_svm *svm);
+void svm_leave_nested(struct kvm_vcpu *vcpu);
 void svm_free_nested(struct vcpu_svm *svm);
 int svm_allocate_nested(struct vcpu_svm *svm);
 int nested_svm_vmrun(struct kvm_vcpu *vcpu);
index c53b8bf..489ca56 100644 (file)
@@ -46,6 +46,9 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
        if (npt_enabled &&
            ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB)
                hve->hv_enlightenments_control.enlightened_npt_tlb = 1;
+
+       if (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)
+               hve->hv_enlightenments_control.msr_bitmap = 1;
 }
 
 static inline void svm_hv_hardware_setup(void)
@@ -83,14 +86,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments(
        struct hv_enlightenments *hve =
                (struct hv_enlightenments *)vmcb->control.reserved_sw;
 
-       /*
-        * vmcb can be NULL if called during early vcpu init.
-        * And its okay not to mark vmcb dirty during vcpu init
-        * as we mark it dirty unconditionally towards end of vcpu
-        * init phase.
-        */
-       if (vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) &&
-           hve->hv_enlightenments_control.msr_bitmap)
+       if (hve->hv_enlightenments_control.msr_bitmap)
                vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
 }
 
index 959b59d..3f430e2 100644 (file)
@@ -54,7 +54,6 @@ struct nested_vmx_msrs {
 
 struct vmcs_config {
        int size;
-       int order;
        u32 basic_cap;
        u32 revision_id;
        u32 pin_based_exec_ctrl;
index ba6f99f..87e3dc1 100644 (file)
@@ -12,8 +12,6 @@
 
 DEFINE_STATIC_KEY_FALSE(enable_evmcs);
 
-#if IS_ENABLED(CONFIG_HYPERV)
-
 #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
 #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
                {EVMCS1_OFFSET(name), clean_field}
@@ -296,6 +294,7 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
 };
 const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
 
+#if IS_ENABLED(CONFIG_HYPERV)
 __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
 {
        vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
@@ -362,6 +361,7 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
        case MSR_IA32_VMX_PROCBASED_CTLS2:
                ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
                break;
+       case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
        case MSR_IA32_VMX_PINBASED_CTLS:
                ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
                break;
index 16731d2..8d70f9a 100644 (file)
@@ -59,12 +59,12 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
         SECONDARY_EXEC_SHADOW_VMCS |                                   \
         SECONDARY_EXEC_TSC_SCALING |                                   \
         SECONDARY_EXEC_PAUSE_LOOP_EXITING)
-#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL                                 \
+       (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |                           \
+        VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
 #define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
 #define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
 
-#if IS_ENABLED(CONFIG_HYPERV)
-
 struct evmcs_field {
        u16 offset;
        u16 clean_field;
@@ -73,26 +73,56 @@ struct evmcs_field {
 extern const struct evmcs_field vmcs_field_to_evmcs_1[];
 extern const unsigned int nr_evmcs_1_fields;
 
-static __always_inline int get_evmcs_offset(unsigned long field,
-                                           u16 *clean_field)
+static __always_inline int evmcs_field_offset(unsigned long field,
+                                             u16 *clean_field)
 {
        unsigned int index = ROL16(field, 6);
        const struct evmcs_field *evmcs_field;
 
-       if (unlikely(index >= nr_evmcs_1_fields)) {
-               WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n",
-                         field);
+       if (unlikely(index >= nr_evmcs_1_fields))
                return -ENOENT;
-       }
 
        evmcs_field = &vmcs_field_to_evmcs_1[index];
 
+       /*
+        * Use offset=0 to detect holes in eVMCS. This offset belongs to
+        * 'revision_id' but this field has no encoding and is supposed to
+        * be accessed directly.
+        */
+       if (unlikely(!evmcs_field->offset))
+               return -ENOENT;
+
        if (clean_field)
                *clean_field = evmcs_field->clean_field;
 
        return evmcs_field->offset;
 }
 
+static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs,
+                                unsigned long field, u16 offset)
+{
+       /*
+        * vmcs12_read_any() doesn't care whether the supplied structure
+        * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes
+        * the exact offset of the required field, use it for convenience
+        * here.
+        */
+       return vmcs12_read_any((void *)evmcs, field, offset);
+}
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+static __always_inline int get_evmcs_offset(unsigned long field,
+                                           u16 *clean_field)
+{
+       int offset = evmcs_field_offset(field, clean_field);
+
+       WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n",
+                 field);
+
+       return offset;
+}
+
 static __always_inline void evmcs_write64(unsigned long field, u64 value)
 {
        u16 clean_field;
index f235f77..ba34e94 100644 (file)
@@ -7,6 +7,7 @@
 #include <asm/mmu_context.h>
 
 #include "cpuid.h"
+#include "evmcs.h"
 #include "hyperv.h"
 #include "mmu.h"
 #include "nested.h"
@@ -4851,18 +4852,20 @@ static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
        struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
 
        /*
-        * We should allocate a shadow vmcs for vmcs01 only when L1
-        * executes VMXON and free it when L1 executes VMXOFF.
-        * As it is invalid to execute VMXON twice, we shouldn't reach
-        * here when vmcs01 already have an allocated shadow vmcs.
+        * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
+        * when L1 executes VMXOFF or the vCPU is forced out of nested
+        * operation.  VMXON faults if the CPU is already post-VMXON, so it
+        * should be impossible to already have an allocated shadow VMCS.  KVM
+        * doesn't support virtualization of VMCS shadowing, so vmcs01 should
+        * always be the loaded VMCS.
         */
-       WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
+       if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
+               return loaded_vmcs->shadow_vmcs;
+
+       loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
+       if (loaded_vmcs->shadow_vmcs)
+               vmcs_clear(loaded_vmcs->shadow_vmcs);
 
-       if (!loaded_vmcs->shadow_vmcs) {
-               loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
-               if (loaded_vmcs->shadow_vmcs)
-                       vmcs_clear(loaded_vmcs->shadow_vmcs);
-       }
        return loaded_vmcs->shadow_vmcs;
 }
 
@@ -5099,27 +5102,49 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
-       /*
-        * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
-        * any VMREAD sets the ALU flags for VMfailInvalid.
-        */
-       if (vmx->nested.current_vmptr == INVALID_GPA ||
-           (is_guest_mode(vcpu) &&
-            get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
-               return nested_vmx_failInvalid(vcpu);
-
        /* Decode instruction info and find the field to read */
        field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
 
-       offset = vmcs_field_to_offset(field);
-       if (offset < 0)
-               return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+       if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+               /*
+                * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
+                * any VMREAD sets the ALU flags for VMfailInvalid.
+                */
+               if (vmx->nested.current_vmptr == INVALID_GPA ||
+                   (is_guest_mode(vcpu) &&
+                    get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
+                       return nested_vmx_failInvalid(vcpu);
 
-       if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
-               copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+               offset = get_vmcs12_field_offset(field);
+               if (offset < 0)
+                       return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+               if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
+                       copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
 
-       /* Read the field, zero-extended to a u64 value */
-       value = vmcs12_read_any(vmcs12, field, offset);
+               /* Read the field, zero-extended to a u64 value */
+               value = vmcs12_read_any(vmcs12, field, offset);
+       } else {
+               /*
+                * Hyper-V TLFS (as of 6.0b) explicitly states that, while an
+                * enlightened VMCS is active, VMREAD/VMWRITE instructions are
+                * unsupported. Unfortunately, certain versions of Windows 11
+                * don't comply with this requirement, which is not enforced in
+                * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
+                * workaround, as misbehaving guests will panic on VM-Fail.
+                * Note, enlightened VMCS is incompatible with shadow VMCS so
+                * all VMREADs from L2 should go to L1.
+                */
+               if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+                       return nested_vmx_failInvalid(vcpu);
+
+               offset = evmcs_field_offset(field, NULL);
+               if (offset < 0)
+                       return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+               /* Read the field, zero-extended to a u64 value */
+               value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset);
+       }
 
        /*
         * Now copy part of this value to register or memory, as requested.
@@ -5214,7 +5239,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 
        field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
 
-       offset = vmcs_field_to_offset(field);
+       offset = get_vmcs12_field_offset(field);
        if (offset < 0)
                return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 
@@ -6462,7 +6487,7 @@ static u64 nested_vmx_calc_vmcs_enum_msr(void)
        max_idx = 0;
        for (i = 0; i < nr_vmcs12_fields; i++) {
                /* The vmcs12 table is very, very sparsely populated. */
-               if (!vmcs_field_to_offset_table[i])
+               if (!vmcs12_field_offsets[i])
                        continue;
 
                idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
@@ -6771,6 +6796,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 }
 
 struct kvm_x86_nested_ops vmx_nested_ops = {
+       .leave_nested = vmx_leave_nested,
        .check_events = vmx_check_nested_events,
        .hv_timer_pending = nested_vmx_preemption_timer_pending,
        .triple_fault = nested_vmx_triple_fault,
index cab6ba7..2251b60 100644 (file)
@@ -8,7 +8,7 @@
        FIELD(number, name),                                            \
        [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
 
-const unsigned short vmcs_field_to_offset_table[] = {
+const unsigned short vmcs12_field_offsets[] = {
        FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
        FIELD(POSTED_INTR_NV, posted_intr_nv),
        FIELD(GUEST_ES_SELECTOR, guest_es_selector),
@@ -151,4 +151,4 @@ const unsigned short vmcs_field_to_offset_table[] = {
        FIELD(HOST_RSP, host_rsp),
        FIELD(HOST_RIP, host_rip),
 };
-const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs_field_to_offset_table);
+const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets);
index 2a45f02..746129d 100644 (file)
@@ -361,10 +361,10 @@ static inline void vmx_check_vmcs12_offsets(void)
        CHECK_OFFSET(guest_pml_index, 996);
 }
 
-extern const unsigned short vmcs_field_to_offset_table[];
+extern const unsigned short vmcs12_field_offsets[];
 extern const unsigned int nr_vmcs12_fields;
 
-static inline short vmcs_field_to_offset(unsigned long field)
+static inline short get_vmcs12_field_offset(unsigned long field)
 {
        unsigned short offset;
        unsigned int index;
@@ -377,7 +377,7 @@ static inline short vmcs_field_to_offset(unsigned long field)
                return -ENOENT;
 
        index = array_index_nospec(index, nr_vmcs12_fields);
-       offset = vmcs_field_to_offset_table[index];
+       offset = vmcs12_field_offsets[index];
        if (offset == 0)
                return -ENOENT;
        return offset;
index 4ac6760..aca3ae2 100644 (file)
@@ -1487,11 +1487,12 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
        return 0;
 }
 
-static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+                                       void *insn, int insn_len)
 {
        /*
         * Emulation of instructions in SGX enclaves is impossible as RIP does
-        * not point  tthe failing instruction, and even if it did, the code
+        * not point at the failing instruction, and even if it did, the code
         * stream is inaccessible.  Inject #UD instead of exiting to userspace
         * so that guest userspace can't DoS the guest simply by triggering
         * emulation (enclaves are CPL3 only).
@@ -2603,7 +2604,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
                return -EIO;
 
        vmcs_conf->size = vmx_msr_high & 0x1fff;
-       vmcs_conf->order = get_order(vmcs_conf->size);
        vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
 
        vmcs_conf->revision_id = vmx_msr_low;
@@ -2628,7 +2628,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
        struct page *pages;
        struct vmcs *vmcs;
 
-       pages = __alloc_pages_node(node, flags, vmcs_config.order);
+       pages = __alloc_pages_node(node, flags, 0);
        if (!pages)
                return NULL;
        vmcs = page_address(pages);
@@ -2647,7 +2647,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
 
 void free_vmcs(struct vmcs *vmcs)
 {
-       free_pages((unsigned long)vmcs, vmcs_config.order);
+       free_page((unsigned long)vmcs);
 }
 
 /*
@@ -4094,10 +4094,14 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
        vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
 
        /*
-        * If 32-bit syscall is enabled, vmx_vcpu_load_vcms rewrites
-        * HOST_IA32_SYSENTER_ESP.
+        * SYSENTER is used for 32-bit system calls on either 32-bit or
+        * 64-bit kernels.  It is always zero if neither is allowed, otherwise
+        * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
+        * have already done so!).
         */
-       vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
+       if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
+               vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
+
        rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
        vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
 
@@ -4901,8 +4905,33 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
                dr6 = vmx_get_exit_qual(vcpu);
                if (!(vcpu->guest_debug &
                      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+                       /*
+                        * If the #DB was due to ICEBP, a.k.a. INT1, skip the
+                        * instruction.  ICEBP generates a trap-like #DB, but
+                        * despite its interception control being tied to #DB,
+                        * is an instruction intercept, i.e. the VM-Exit occurs
+                        * on the ICEBP itself.  Note, skipping ICEBP also
+                        * clears STI and MOVSS blocking.
+                        *
+                        * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
+                        * if single-step is enabled in RFLAGS and STI or MOVSS
+                        * blocking is active, as the CPU doesn't set the bit
+                        * on VM-Exit due to #DB interception.  VM-Entry has a
+                        * consistency check that a single-step #DB is pending
+                        * in this scenario as the previous instruction cannot
+                        * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
+                        * don't modify RFLAGS), therefore the one instruction
+                        * delay when activating single-step breakpoints must
+                        * have already expired.  Note, the CPU sets/clears BS
+                        * as appropriate for all other VM-Exit types.
+                        */
                        if (is_icebp(intr_info))
                                WARN_ON(!skip_emulated_instruction(vcpu));
+                       else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
+                                (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+                                 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
+                               vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+                                           vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
 
                        kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
                        return 1;
@@ -5397,7 +5426,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 {
        gpa_t gpa;
 
-       if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
+       if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
                return 1;
 
        /*
index 9e43d75..74b53a1 100644 (file)
@@ -3535,6 +3535,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                if (data & ~supported_xss)
                        return 1;
                vcpu->arch.ia32_xss = data;
+               kvm_update_cpuid_runtime(vcpu);
                break;
        case MSR_SMI_COUNT:
                if (!msr_info->host_initiated)
@@ -4229,6 +4230,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_SREGS2:
        case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
        case KVM_CAP_VCPU_ATTRIBUTES:
+       case KVM_CAP_SYS_ATTRIBUTES:
                r = 1;
                break;
        case KVM_CAP_EXIT_HYPERCALL:
@@ -4331,7 +4333,49 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                break;
        }
        return r;
+}
+
+static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
+{
+       void __user *uaddr = (void __user*)(unsigned long)attr->addr;
 
+       if ((u64)(unsigned long)uaddr != attr->addr)
+               return ERR_PTR(-EFAULT);
+       return uaddr;
+}
+
+static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
+{
+       u64 __user *uaddr = kvm_get_attr_addr(attr);
+
+       if (attr->group)
+               return -ENXIO;
+
+       if (IS_ERR(uaddr))
+               return PTR_ERR(uaddr);
+
+       switch (attr->attr) {
+       case KVM_X86_XCOMP_GUEST_SUPP:
+               if (put_user(supported_xcr0, uaddr))
+                       return -EFAULT;
+               return 0;
+       default:
+               return -ENXIO;
+               break;
+       }
+}
+
+static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
+{
+       if (attr->group)
+               return -ENXIO;
+
+       switch (attr->attr) {
+       case KVM_X86_XCOMP_GUEST_SUPP:
+               return 0;
+       default:
+               return -ENXIO;
+       }
 }
 
 long kvm_arch_dev_ioctl(struct file *filp,
@@ -4422,6 +4466,22 @@ long kvm_arch_dev_ioctl(struct file *filp,
        case KVM_GET_SUPPORTED_HV_CPUID:
                r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
                break;
+       case KVM_GET_DEVICE_ATTR: {
+               struct kvm_device_attr attr;
+               r = -EFAULT;
+               if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+                       break;
+               r = kvm_x86_dev_get_attr(&attr);
+               break;
+       }
+       case KVM_HAS_DEVICE_ATTR: {
+               struct kvm_device_attr attr;
+               r = -EFAULT;
+               if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+                       break;
+               r = kvm_x86_dev_has_attr(&attr);
+               break;
+       }
        default:
                r = -EINVAL;
                break;
@@ -4860,8 +4920,10 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                vcpu->arch.apic->sipi_vector = events->sipi_vector;
 
        if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
-               if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm)
+               if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
+                       kvm_x86_ops.nested_ops->leave_nested(vcpu);
                        kvm_smm_changed(vcpu, events->smi.smm);
+               }
 
                vcpu->arch.smi_pending = events->smi.pending;
 
@@ -5022,11 +5084,11 @@ static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
 static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
                                 struct kvm_device_attr *attr)
 {
-       u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr;
+       u64 __user *uaddr = kvm_get_attr_addr(attr);
        int r;
 
-       if ((u64)(unsigned long)uaddr != attr->addr)
-               return -EFAULT;
+       if (IS_ERR(uaddr))
+               return PTR_ERR(uaddr);
 
        switch (attr->attr) {
        case KVM_VCPU_TSC_OFFSET:
@@ -5045,12 +5107,12 @@ static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
 static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
                                 struct kvm_device_attr *attr)
 {
-       u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr;
+       u64 __user *uaddr = kvm_get_attr_addr(attr);
        struct kvm *kvm = vcpu->kvm;
        int r;
 
-       if ((u64)(unsigned long)uaddr != attr->addr)
-               return -EFAULT;
+       if (IS_ERR(uaddr))
+               return PTR_ERR(uaddr);
 
        switch (attr->attr) {
        case KVM_VCPU_TSC_OFFSET: {
@@ -6810,6 +6872,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
 
+static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
+                               void *insn, int insn_len)
+{
+       return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type,
+                                                           insn, insn_len);
+}
+
 int handle_ud(struct kvm_vcpu *vcpu)
 {
        static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
@@ -6817,7 +6886,7 @@ int handle_ud(struct kvm_vcpu *vcpu)
        char sig[5]; /* ud2; .ascii "kvm" */
        struct x86_exception e;
 
-       if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, NULL, 0)))
+       if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
                return 1;
 
        if (force_emulation_prefix &&
@@ -8193,7 +8262,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        bool writeback = true;
        bool write_fault_to_spt;
 
-       if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, insn, insn_len)))
+       if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
                return 1;
 
        vcpu->arch.l1tf_flush_l1d = true;
@@ -9706,7 +9775,7 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
                kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
 }
 
-void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
+static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 {
        if (!lapic_in_kernel(vcpu))
                return;
@@ -11209,7 +11278,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
                vcpu->arch.msr_misc_features_enables = 0;
 
-               vcpu->arch.xcr0 = XFEATURE_MASK_FP;
+               __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
+               __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
        }
 
        /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
@@ -11226,8 +11296,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0);
        kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
 
-       vcpu->arch.ia32_xss = 0;
-
        static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
 
        kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
index 0e3f7d6..bad5753 100644 (file)
@@ -316,10 +316,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
                                     "\tnotq %0\n"
                                     "\t" LOCK_PREFIX "andq %0, %2\n"
                                     "2:\n"
-                                    "\t.section .fixup,\"ax\"\n"
-                                    "3:\tjmp\t2b\n"
-                                    "\t.previous\n"
-                                    _ASM_EXTABLE_UA(1b, 3b)
+                                    _ASM_EXTABLE_UA(1b, 2b)
                                     : "=r" (evtchn_pending_sel),
                                       "+m" (vi->evtchn_pending_sel),
                                       "+m" (v->arch.xen.evtchn_pending_sel)
@@ -335,10 +332,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
                                     "\tnotl %0\n"
                                     "\t" LOCK_PREFIX "andl %0, %2\n"
                                     "2:\n"
-                                    "\t.section .fixup,\"ax\"\n"
-                                    "3:\tjmp\t2b\n"
-                                    "\t.previous\n"
-                                    _ASM_EXTABLE_UA(1b, 3b)
+                                    _ASM_EXTABLE_UA(1b, 2b)
                                     : "=r" (evtchn_pending_sel32),
                                       "+m" (vi->evtchn_pending_sel),
                                       "+m" (v->arch.xen.evtchn_pending_sel)
index 9563d29..b46bcdb 100644 (file)
@@ -1133,6 +1133,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
 #define KVM_CAP_VM_GPA_BITS 207
 #define KVM_CAP_XSAVE2 208
+#define KVM_CAP_SYS_ATTRIBUTES 209
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
index 2da3316..bf6e960 100644 (file)
@@ -452,6 +452,9 @@ struct kvm_sync_regs {
 
 #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE        0x00000001
 
+/* attributes for system fd (group 0) */
+#define KVM_X86_XCOMP_GUEST_SUPP       0
+
 struct kvm_vmx_nested_state_data {
        __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
        __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
index 9563d29..b46bcdb 100644 (file)
@@ -1133,6 +1133,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
 #define KVM_CAP_VM_GPA_BITS 207
 #define KVM_CAP_XSAVE2 208
+#define KVM_CAP_SYS_ATTRIBUTES 209
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
index 81ebf99..0e4926b 100644 (file)
@@ -85,6 +85,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_pi_mmio_test
 TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests
 TEST_GEN_PROGS_x86_64 += x86_64/amx_test
+TEST_GEN_PROGS_x86_64 += access_tracking_perf_test
 TEST_GEN_PROGS_x86_64 += demand_paging_test
 TEST_GEN_PROGS_x86_64 += dirty_log_test
 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
index 66775de..4ed6aa0 100644 (file)
@@ -345,7 +345,6 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
  *   guest_code - The vCPU's entry point
  */
 void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
-void vm_xsave_req_perm(void);
 
 bool vm_is_unrestricted_guest(struct kvm_vm *vm);
 
index 423d8a6..8a470da 100644 (file)
@@ -458,6 +458,7 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
 struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void);
 void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
 struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
+void vm_xsave_req_perm(int bit);
 
 enum x86_page_size {
        X86_PAGE_SIZE_4K = 0,
index 8c53f96..d8cf851 100644 (file)
@@ -393,13 +393,6 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
        struct kvm_vm *vm;
        int i;
 
-#ifdef __x86_64__
-       /*
-        * Permission needs to be requested before KVM_SET_CPUID2.
-        */
-       vm_xsave_req_perm();
-#endif
-
        /* Force slot0 memory size to be no smaller than DEFAULT_GUEST_PHY_PAGES */
        if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES)
                slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES;
index 5f9d7e9..9f000df 100644 (file)
@@ -665,16 +665,31 @@ static bool is_xfd_supported(void)
        return !!(eax & CPUID_XFD_BIT);
 }
 
-void vm_xsave_req_perm(void)
+void vm_xsave_req_perm(int bit)
 {
-       unsigned long bitmask;
+       int kvm_fd;
+       u64 bitmask;
        long rc;
+       struct kvm_device_attr attr = {
+               .group = 0,
+               .attr = KVM_X86_XCOMP_GUEST_SUPP,
+               .addr = (unsigned long) &bitmask
+       };
+
+       kvm_fd = open_kvm_dev_path_or_exit();
+       rc = ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr);
+       close(kvm_fd);
+       if (rc == -1 && (errno == ENXIO || errno == EINVAL))
+               exit(KSFT_SKIP);
+       TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc);
+       if (!(bitmask & (1ULL << bit)))
+               exit(KSFT_SKIP);
 
        if (!is_xfd_supported())
-               return;
+               exit(KSFT_SKIP);
+
+       rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit);
 
-       rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM,
-                    XSTATE_XTILE_DATA_BIT);
        /*
         * The older kernel version(<5.15) can't support
         * ARCH_REQ_XCOMP_GUEST_PERM and directly return.
@@ -684,7 +699,7 @@ void vm_xsave_req_perm(void)
 
        rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask);
        TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc);
-       TEST_ASSERT(bitmask & XFEATURE_XTILE_MASK,
+       TEST_ASSERT(bitmask & (1ULL << bit),
                    "prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure bitmask=0x%lx",
                    bitmask);
 }
index 523c1e9..52a3ef6 100644 (file)
@@ -329,6 +329,8 @@ int main(int argc, char *argv[])
        u32 amx_offset;
        int stage, ret;
 
+       vm_xsave_req_perm(XSTATE_XTILE_DATA_BIT);
+
        /* Create VM */
        vm = vm_create_default(VCPU_ID, 0, guest_code);
 
index 2da8eb8..a626d40 100644 (file)
@@ -105,7 +105,6 @@ static void guest_code(void *arg)
 
                if (cpu_has_svm()) {
                        run_guest(svm->vmcb, svm->vmcb_gpa);
-                       svm->vmcb->save.rip += 3;
                        run_guest(svm->vmcb, svm->vmcb_gpa);
                } else {
                        vmlaunch();
index 2ad013b..59b1dd4 100644 (file)
@@ -463,8 +463,8 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
        idx = srcu_read_lock(&kvm->irq_srcu);
        gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
        if (gsi != -1)
-               hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
-                                        link)
+               hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
+                                         link, srcu_read_lock_held(&kvm->irq_srcu))
                        if (kian->gsi == gsi) {
                                srcu_read_unlock(&kvm->irq_srcu, idx);
                                return true;
@@ -480,8 +480,8 @@ void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
 {
        struct kvm_irq_ack_notifier *kian;
 
-       hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
-                                link)
+       hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
+                                 link, srcu_read_lock_held(&kvm->irq_srcu))
                if (kian->gsi == gsi)
                        kian->irq_acked(kian);
 }
index 9a20f22..58d31da 100644 (file)
@@ -2248,7 +2248,6 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn
 
        return NULL;
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
 
 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
@@ -2463,9 +2462,8 @@ static int kvm_try_get_pfn(kvm_pfn_t pfn)
 }
 
 static int hva_to_pfn_remapped(struct vm_area_struct *vma,
-                              unsigned long addr, bool *async,
-                              bool write_fault, bool *writable,
-                              kvm_pfn_t *p_pfn)
+                              unsigned long addr, bool write_fault,
+                              bool *writable, kvm_pfn_t *p_pfn)
 {
        kvm_pfn_t pfn;
        pte_t *ptep;
@@ -2575,7 +2573,7 @@ retry:
        if (vma == NULL)
                pfn = KVM_PFN_ERR_FAULT;
        else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
-               r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
+               r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
                if (r == -EAGAIN)
                        goto retry;
                if (r < 0)