Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

author Linus Torvalds <torvalds@linux-foundation.org>

Fri, 28 Jan 2022 17:00:26 +0000 (19:00 +0200)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 28 Jan 2022 17:00:26 +0000 (19:00 +0200)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 28 Jan 2022 17:00:26 +0000 (19:00 +0200)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 28 Jan 2022 17:00:26 +0000 (19:00 +0200)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst

index bb8cfdd..a426710 100644 (file)
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -3268,6 +3268,7 @@ number.
  
  :Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device,
               KVM_CAP_VCPU_ATTRIBUTES for vcpu device
+             KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device (no set)
  :Type: device ioctl, vm ioctl, vcpu ioctl
  :Parameters: struct kvm_device_attr
  :Returns: 0 on success, -1 on error
@@ -3302,7 +3303,8 @@ transferred is defined by the particular attribute.
  ------------------------
  
  :Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device,
-            KVM_CAP_VCPU_ATTRIBUTES for vcpu device
+             KVM_CAP_VCPU_ATTRIBUTES for vcpu device
+             KVM_CAP_SYS_ATTRIBUTES for system (/dev/kvm) device
  :Type: device ioctl, vm ioctl, vcpu ioctl
  :Parameters: struct kvm_device_attr
  :Returns: 0 on success, -1 on error
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c

index 0418399..c5d0097 100644 (file)
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -38,7 +38,10 @@ static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
  
  static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, u64 val)
  {
-       write_sysreg_el1(val, SYS_SPSR);
+       if (has_vhe())
+               write_sysreg_el1(val, SYS_SPSR);
+       else
+               __vcpu_sys_reg(vcpu, SPSR_EL1) = val;
  }
  
  static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val)
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c

index 844a6f0..2cb3867 100644 (file)
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -983,13 +983,9 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
          */
         stage2_put_pte(ptep, mmu, addr, level, mm_ops);
  
-       if (need_flush) {
-               kvm_pte_t *pte_follow = kvm_pte_follow(pte, mm_ops);
-
-               dcache_clean_inval_poc((unsigned long)pte_follow,
-                                   (unsigned long)pte_follow +
-                                           kvm_granule_size(level));
-       }
+       if (need_flush && mm_ops->dcache_clean_inval_poc)
+               mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops),
+                                              kvm_granule_size(level));
  
         if (childp)
                 mm_ops->put_page(childp);
@@ -1151,15 +1147,13 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
         struct kvm_pgtable *pgt = arg;
         struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
         kvm_pte_t pte = *ptep;
-       kvm_pte_t *pte_follow;
  
         if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte))
                 return 0;
  
-       pte_follow = kvm_pte_follow(pte, mm_ops);
-       dcache_clean_inval_poc((unsigned long)pte_follow,
-                           (unsigned long)pte_follow +
-                                   kvm_granule_size(level));
+       if (mm_ops->dcache_clean_inval_poc)
+               mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops),
+                                              kvm_granule_size(level));
         return 0;
  }
  
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c

index 20db2f2..4fb419f 100644 (file)
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -983,6 +983,9 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
         val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT;
         /* IDbits */
         val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT;
+       /* SEIS */
+       if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK)
+               val |= BIT(ICC_CTLR_EL1_SEIS_SHIFT);
         /* A3V */
         val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT;
         /* EOImode */
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c

index a33d436..b549af8 100644 (file)
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -609,6 +609,18 @@ static int __init early_gicv4_enable(char *buf)
  }
  early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
  
+static const struct midr_range broken_seis[] = {
+       MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM),
+       MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM),
+       {},
+};
+
+static bool vgic_v3_broken_seis(void)
+{
+       return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) &&
+               is_midr_in_range_list(read_cpuid_id(), broken_seis));
+}
+
  /**
   * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller
   * @info:      pointer to the GIC description
@@ -676,9 +688,10 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
                 group1_trap = true;
         }
  
-       if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) {
-               kvm_info("GICv3 with locally generated SEI\n");
+       if (vgic_v3_broken_seis()) {
+               kvm_info("GICv3 with broken locally generated SEI\n");
  
+               kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_SEIS_MASK;
                 group0_trap = true;
                 group1_trap = true;
                 if (ich_vtr_el2 & ICH_VTR_TDS_MASK)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h

index 1384517..6e7c545 100644 (file)
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1483,7 +1483,8 @@ struct kvm_x86_ops {
  
         int (*get_msr_feature)(struct kvm_msr_entry *entry);
  
-       bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len);
+       bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
+                                       void *insn, int insn_len);
  
         bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
         int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
@@ -1496,6 +1497,7 @@ struct kvm_x86_ops {
  };
  
  struct kvm_x86_nested_ops {
+       void (*leave_nested)(struct kvm_vcpu *vcpu);
         int (*check_events)(struct kvm_vcpu *vcpu);
         bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
         void (*triple_fault)(struct kvm_vcpu *vcpu);
@@ -1861,7 +1863,6 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v);
  int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
  int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
  void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
-void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
  
  int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
                     unsigned long ipi_bitmap_high, u32 min,
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h

index 2da3316..bf6e960 100644 (file)
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -452,6 +452,9 @@ struct kvm_sync_regs {
  
  #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE        0x00000001
  
+/* attributes for system fd (group 0) */
+#define KVM_X86_XCOMP_GUEST_SUPP       0
+
  struct kvm_vmx_nested_state_data {
         __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
         __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c

index 3902c28..28be02a 100644 (file)
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -133,6 +133,7 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2
                 orig = &vcpu->arch.cpuid_entries[i];
                 if (e2[i].function != orig->function ||
                     e2[i].index != orig->index ||
+                   e2[i].flags != orig->flags ||
                     e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
                     e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
                         return -EINVAL;
@@ -196,10 +197,26 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
                 vcpu->arch.pv_cpuid.features = best->eax;
  }
  
+/*
+ * Calculate guest's supported XCR0 taking into account guest CPUID data and
+ * supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0).
+ */
+static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent)
+{
+       struct kvm_cpuid_entry2 *best;
+
+       best = cpuid_entry2_find(entries, nent, 0xd, 0);
+       if (!best)
+               return 0;
+
+       return (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
+}
+
  static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
                                        int nent)
  {
         struct kvm_cpuid_entry2 *best;
+       u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent);
  
         best = cpuid_entry2_find(entries, nent, 1, 0);
         if (best) {
@@ -238,6 +255,21 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
                                            vcpu->arch.ia32_misc_enable_msr &
                                            MSR_IA32_MISC_ENABLE_MWAIT);
         }
+
+       /*
+        * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
+        * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
+        * requested XCR0 value.  The enclave's XFRM must be a subset of XCR0
+        * at the time of EENTER, thus adjust the allowed XFRM by the guest's
+        * supported XCR0.  Similar to XCR0 handling, FP and SSE are forced to
+        * '1' even on CPUs that don't support XSAVE.
+        */
+       best = cpuid_entry2_find(entries, nent, 0x12, 0x1);
+       if (best) {
+               best->ecx &= guest_supported_xcr0 & 0xffffffff;
+               best->edx &= guest_supported_xcr0 >> 32;
+               best->ecx |= XFEATURE_MASK_FPSSE;
+       }
  }
  
  void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
@@ -261,27 +293,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
                 kvm_apic_set_version(vcpu);
         }
  
-       best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
-       if (!best)
-               vcpu->arch.guest_supported_xcr0 = 0;
-       else
-               vcpu->arch.guest_supported_xcr0 =
-                       (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
-
-       /*
-        * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
-        * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
-        * requested XCR0 value.  The enclave's XFRM must be a subset of XCRO
-        * at the time of EENTER, thus adjust the allowed XFRM by the guest's
-        * supported XCR0.  Similar to XCR0 handling, FP and SSE are forced to
-        * '1' even on CPUs that don't support XSAVE.
-        */
-       best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1);
-       if (best) {
-               best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff;
-               best->edx &= vcpu->arch.guest_supported_xcr0 >> 32;
-               best->ecx |= XFEATURE_MASK_FPSSE;
-       }
+       vcpu->arch.guest_supported_xcr0 =
+               cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
  
         kvm_update_pv_runtime(vcpu);
  
@@ -346,8 +359,14 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
          * KVM_SET_CPUID{,2} again. To support this legacy behavior, check
          * whether the supplied CPUID data is equal to what's already set.
          */
-       if (vcpu->arch.last_vmentry_cpu != -1)
-               return kvm_cpuid_check_equal(vcpu, e2, nent);
+       if (vcpu->arch.last_vmentry_cpu != -1) {
+               r = kvm_cpuid_check_equal(vcpu, e2, nent);
+               if (r)
+                       return r;
+
+               kvfree(e2);
+               return 0;
+       }
  
         r = kvm_check_cpuid(vcpu, e2, nent);
         if (r)
@@ -887,13 +906,14 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                 }
                 break;
         case 0xd: {
-               u64 guest_perm = xstate_get_guest_group_perm();
+               u64 permitted_xcr0 = supported_xcr0 & xstate_get_guest_group_perm();
+               u64 permitted_xss = supported_xss;
  
-               entry->eax &= supported_xcr0 & guest_perm;
-               entry->ebx = xstate_required_size(supported_xcr0, false);
+               entry->eax &= permitted_xcr0;
+               entry->ebx = xstate_required_size(permitted_xcr0, false);
                 entry->ecx = entry->ebx;
-               entry->edx &= (supported_xcr0 & guest_perm) >> 32;
-               if (!supported_xcr0)
+               entry->edx &= permitted_xcr0 >> 32;
+               if (!permitted_xcr0)
                         break;
  
                 entry = do_host_cpuid(array, function, 1);
@@ -902,20 +922,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
  
                 cpuid_entry_override(entry, CPUID_D_1_EAX);
                 if (entry->eax & (F(XSAVES)|F(XSAVEC)))
-                       entry->ebx = xstate_required_size(supported_xcr0 | supported_xss,
+                       entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss,
                                                           true);
                 else {
-                       WARN_ON_ONCE(supported_xss != 0);
+                       WARN_ON_ONCE(permitted_xss != 0);
                         entry->ebx = 0;
                 }
-               entry->ecx &= supported_xss;
-               entry->edx &= supported_xss >> 32;
+               entry->ecx &= permitted_xss;
+               entry->edx &= permitted_xss >> 32;
  
                 for (i = 2; i < 64; ++i) {
                         bool s_state;
-                       if (supported_xcr0 & BIT_ULL(i))
+                       if (permitted_xcr0 & BIT_ULL(i))
                                 s_state = false;
-                       else if (supported_xss & BIT_ULL(i))
+                       else if (permitted_xss & BIT_ULL(i))
                                 s_state = true;
                         else
                                 continue;
@@ -929,7 +949,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                          * invalid sub-leafs.  Only valid sub-leafs should
                          * reach this point, and they should have a non-zero
                          * save state size.  Furthermore, check whether the
-                        * processor agrees with supported_xcr0/supported_xss
+                        * processor agrees with permitted_xcr0/permitted_xss
                          * on whether this is an XCR0- or IA32_XSS-managed area.
                          */
                         if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c

index baca9fa..4662469 100644 (file)
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2629,7 +2629,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
         kvm_apic_set_version(vcpu);
  
         apic_update_ppr(apic);
-       hrtimer_cancel(&apic->lapic_timer.timer);
+       cancel_apic_timer(apic);
         apic->lapic_timer.expired_tscdeadline = 0;
         apic_update_lvtt(apic);
         apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c

index cf20685..1218b5a 100644 (file)
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -983,9 +983,9 @@ void svm_free_nested(struct vcpu_svm *svm)
  /*
   * Forcibly leave nested mode in order to be able to reset the VCPU later on.
   */
-void svm_leave_nested(struct vcpu_svm *svm)
+void svm_leave_nested(struct kvm_vcpu *vcpu)
  {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
+       struct vcpu_svm *svm = to_svm(vcpu);
  
         if (is_guest_mode(vcpu)) {
                 svm->nested.nested_run_pending = 0;
@@ -1411,7 +1411,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
                 return -EINVAL;
  
         if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
-               svm_leave_nested(svm);
+               svm_leave_nested(vcpu);
                 svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
                 return 0;
         }
@@ -1478,7 +1478,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
          */
  
         if (is_guest_mode(vcpu))
-               svm_leave_nested(svm);
+               svm_leave_nested(vcpu);
         else
                 svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
  
@@ -1532,6 +1532,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
  }
  
  struct kvm_x86_nested_ops svm_nested_ops = {
+       .leave_nested = svm_leave_nested,
         .check_events = svm_check_nested_events,
         .triple_fault = nested_svm_triple_fault,
         .get_nested_state_pages = svm_get_nested_state_pages,
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c

index 6a22798..17b5345 100644 (file)
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2100,8 +2100,13 @@ void __init sev_hardware_setup(void)
         if (!sev_enabled || !npt_enabled)
                 goto out;
  
-       /* Does the CPU support SEV? */
-       if (!boot_cpu_has(X86_FEATURE_SEV))
+       /*
+        * SEV must obviously be supported in hardware.  Sanity check that the
+        * CPU supports decode assists, which is mandatory for SEV guests to
+        * support instruction emulation.
+        */
+       if (!boot_cpu_has(X86_FEATURE_SEV) ||
+           WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)))
                 goto out;
  
         /* Retrieve SEV CPUID information */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c

index 2c99b18..6d97629 100644 (file)
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -290,7 +290,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
  
         if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
                 if (!(efer & EFER_SVME)) {
-                       svm_leave_nested(svm);
+                       svm_leave_nested(vcpu);
                         svm_set_gif(svm, true);
                         /* #GP intercept is still needed for vmware backdoor */
                         if (!enable_vmware_backdoor)
@@ -312,7 +312,11 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
                                 return ret;
                         }
  
-                       if (svm_gp_erratum_intercept)
+                       /*
+                        * Never intercept #GP for SEV guests, KVM can't
+                        * decrypt guest memory to work around the erratum.
+                        */
+                       if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
                                 set_exception_intercept(svm, GP_VECTOR);
                 }
         }
@@ -1010,9 +1014,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
          * Guest access to VMware backdoor ports could legitimately
          * trigger #GP because of TSS I/O permission bitmap.
          * We intercept those #GP and allow access to them anyway
-        * as VMware does.
+        * as VMware does.  Don't intercept #GP for SEV guests as KVM can't
+        * decrypt guest memory to decode the faulting instruction.
          */
-       if (enable_vmware_backdoor)
+       if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
                 set_exception_intercept(svm, GP_VECTOR);
  
         svm_set_intercept(svm, INTERCEPT_INTR);
@@ -2091,10 +2096,6 @@ static int gp_interception(struct kvm_vcpu *vcpu)
         if (error_code)
                 goto reinject;
  
-       /* All SVM instructions expect page aligned RAX */
-       if (svm->vmcb->save.rax & ~PAGE_MASK)
-               goto reinject;
-
         /* Decode the instruction for usage later */
         if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
                 goto reinject;
@@ -2112,8 +2113,13 @@ static int gp_interception(struct kvm_vcpu *vcpu)
                 if (!is_guest_mode(vcpu))
                         return kvm_emulate_instruction(vcpu,
                                 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
-       } else
+       } else {
+               /* All SVM instructions expect page aligned RAX */
+               if (svm->vmcb->save.rax & ~PAGE_MASK)
+                       goto reinject;
+
                 return emulate_svm_instr(vcpu, opcode);
+       }
  
  reinject:
         kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
@@ -4252,79 +4258,140 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
         }
  }
  
-static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+                                       void *insn, int insn_len)
  {
         bool smep, smap, is_user;
         unsigned long cr4;
+       u64 error_code;
+
+       /* Emulation is always possible when KVM has access to all guest state. */
+       if (!sev_guest(vcpu->kvm))
+               return true;
+
+       /* #UD and #GP should never be intercepted for SEV guests. */
+       WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
+                                 EMULTYPE_TRAP_UD_FORCED |
+                                 EMULTYPE_VMWARE_GP));
  
         /*
-        * When the guest is an SEV-ES guest, emulation is not possible.
+        * Emulation is impossible for SEV-ES guests as KVM doesn't have access
+        * to guest register state.
          */
         if (sev_es_guest(vcpu->kvm))
                 return false;
  
+       /*
+        * Emulation is possible if the instruction is already decoded, e.g.
+        * when completing I/O after returning from userspace.
+        */
+       if (emul_type & EMULTYPE_NO_DECODE)
+               return true;
+
+       /*
+        * Emulation is possible for SEV guests if and only if a prefilled
+        * buffer containing the bytes of the intercepted instruction is
+        * available. SEV guest memory is encrypted with a guest specific key
+        * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
+        * decode garbage.
+        *
+        * Inject #UD if KVM reached this point without an instruction buffer.
+        * In practice, this path should never be hit by a well-behaved guest,
+        * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
+        * is still theoretically reachable, e.g. via unaccelerated fault-like
+        * AVIC access, and needs to be handled by KVM to avoid putting the
+        * guest into an infinite loop.   Injecting #UD is somewhat arbitrary,
+        * but it's the least awful option given lack of insight into the guest.
+        */
+       if (unlikely(!insn)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return false;
+       }
+
+       /*
+        * Emulate for SEV guests if the insn buffer is not empty.  The buffer
+        * will be empty if the DecodeAssist microcode cannot fetch bytes for
+        * the faulting instruction because the code fetch itself faulted, e.g.
+        * the guest attempted to fetch from emulated MMIO or a guest page
+        * table used to translate CS:RIP resides in emulated MMIO.
+        */
+       if (likely(insn_len))
+               return true;
+
         /*
          * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
          *
          * Errata:
-        * When CPU raise #NPF on guest data access and vCPU CR4.SMAP=1, it is
-        * possible that CPU microcode implementing DecodeAssist will fail
-        * to read bytes of instruction which caused #NPF. In this case,
-        * GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly
-        * return 0 instead of the correct guest instruction bytes.
-        *
-        * This happens because CPU microcode reading instruction bytes
-        * uses a special opcode which attempts to read data using CPL=0
-        * privileges. The microcode reads CS:RIP and if it hits a SMAP
-        * fault, it gives up and returns no instruction bytes.
+        * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
+        * possible that CPU microcode implementing DecodeAssist will fail to
+        * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
+        * be '0'.  This happens because microcode reads CS:RIP using a _data_
+        * load uop with CPL=0 privileges.  If the load hits a SMAP #PF, ucode
+        * gives up and does not fill the instruction bytes buffer.
          *
-        * Detection:
-        * We reach here in case CPU supports DecodeAssist, raised #NPF and
-        * returned 0 in GuestIntrBytes field of the VMCB.
-        * First, errata can only be triggered in case vCPU CR4.SMAP=1.
-        * Second, if vCPU CR4.SMEP=1, errata could only be triggered
-        * in case vCPU CPL==3 (Because otherwise guest would have triggered
-        * a SMEP fault instead of #NPF).
-        * Otherwise, vCPU CR4.SMEP=0, errata could be triggered by any vCPU CPL.
-        * As most guests enable SMAP if they have also enabled SMEP, use above
-        * logic in order to attempt minimize false-positive of detecting errata
-        * while still preserving all cases semantic correctness.
+        * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
+        * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
+        * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
+        * GuestIntrBytes field of the VMCB.
          *
-        * Workaround:
-        * To determine what instruction the guest was executing, the hypervisor
-        * will have to decode the instruction at the instruction pointer.
+        * This does _not_ mean that the erratum has been encountered, as the
+        * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
+        * #PF, e.g. if the guest attempted to execute from emulated MMIO and
+        * encountered a reserved/not-present #PF.
          *
-        * In non SEV guest, hypervisor will be able to read the guest
-        * memory to decode the instruction pointer when insn_len is zero
-        * so we return true to indicate that decoding is possible.
+        * To hit the erratum, the following conditions must be true:
+        *    1. CR4.SMAP=1 (obviously).
+        *    2. CR4.SMEP=0 || CPL=3.  If SMEP=1 and CPL<3, the erratum cannot
+        *       have been hit as the guest would have encountered a SMEP
+        *       violation #PF, not a #NPF.
+        *    3. The #NPF is not due to a code fetch, in which case failure to
+        *       retrieve the instruction bytes is legitimate (see above).
          *
-        * But in the SEV guest, the guest memory is encrypted with the
-        * guest specific key and hypervisor will not be able to decode the
-        * instruction pointer so we will not able to workaround it. Lets
-        * print the error and request to kill the guest.
+        * In addition, don't apply the erratum workaround if the #NPF occurred
+        * while translating guest page tables (see below).
          */
-       if (likely(!insn || insn_len))
-               return true;
-
-       /*
-        * If RIP is invalid, go ahead with emulation which will cause an
-        * internal error exit.
-        */
-       if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
-               return true;
+       error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
+       if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
+               goto resume_guest;
  
         cr4 = kvm_read_cr4(vcpu);
         smep = cr4 & X86_CR4_SMEP;
         smap = cr4 & X86_CR4_SMAP;
         is_user = svm_get_cpl(vcpu) == 3;
         if (smap && (!smep || is_user)) {
-               if (!sev_guest(vcpu->kvm))
-                       return true;
-
                 pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
-               kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+
+               /*
+                * If the fault occurred in userspace, arbitrarily inject #GP
+                * to avoid killing the guest and to hopefully avoid confusing
+                * the guest kernel too much, e.g. injecting #PF would not be
+                * coherent with respect to the guest's page tables.  Request
+                * triple fault if the fault occurred in the kernel as there's
+                * no fault that KVM can inject without confusing the guest.
+                * In practice, the triple fault is moot as no sane SEV kernel
+                * will execute from user memory while also running with SMAP=1.
+                */
+               if (is_user)
+                       kvm_inject_gp(vcpu, 0);
+               else
+                       kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
         }
  
+resume_guest:
+       /*
+        * If the erratum was not hit, simply resume the guest and let it fault
+        * again.  While awful, e.g. the vCPU may get stuck in an infinite loop
+        * if the fault is at CPL=0, it's the lesser of all evils.  Exiting to
+        * userspace will kill the guest, and letting the emulator read garbage
+        * will yield random behavior and potentially corrupt the guest.
+        *
+        * Simply resuming the guest is technically not a violation of the SEV
+        * architecture.  AMD's APM states that all code fetches and page table
+        * accesses for SEV guest are encrypted, regardless of the C-Bit.  The
+        * APM also states that encrypted accesses to MMIO are "ignored", but
+        * doesn't explicitly define "ignored", i.e. doing nothing and letting
+        * the guest spin is technically "ignoring" the access.
+        */
         return false;
  }
  
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h

index 47ef8f4..7352535 100644 (file)
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -304,11 +304,6 @@ static inline void vmcb_mark_all_clean(struct vmcb *vmcb)
                                & ~VMCB_ALWAYS_DIRTY_MASK;
  }
  
-static inline bool vmcb_is_clean(struct vmcb *vmcb, int bit)
-{
-       return (vmcb->control.clean & (1 << bit));
-}
-
  static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
  {
         vmcb->control.clean &= ~(1 << bit);
@@ -525,7 +520,7 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
  
  int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
                          u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun);
-void svm_leave_nested(struct vcpu_svm *svm);
+void svm_leave_nested(struct kvm_vcpu *vcpu);
  void svm_free_nested(struct vcpu_svm *svm);
  int svm_allocate_nested(struct vcpu_svm *svm);
  int nested_svm_vmrun(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h

index c53b8bf..489ca56 100644 (file)
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -46,6 +46,9 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
         if (npt_enabled &&
             ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB)
                 hve->hv_enlightenments_control.enlightened_npt_tlb = 1;
+
+       if (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)
+               hve->hv_enlightenments_control.msr_bitmap = 1;
  }
  
  static inline void svm_hv_hardware_setup(void)
@@ -83,14 +86,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments(
         struct hv_enlightenments *hve =
                 (struct hv_enlightenments *)vmcb->control.reserved_sw;
  
-       /*
-        * vmcb can be NULL if called during early vcpu init.
-        * And its okay not to mark vmcb dirty during vcpu init
-        * as we mark it dirty unconditionally towards end of vcpu
-        * init phase.
-        */
-       if (vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) &&
-           hve->hv_enlightenments_control.msr_bitmap)
+       if (hve->hv_enlightenments_control.msr_bitmap)
                 vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
  }
  
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h

index 959b59d..3f430e2 100644 (file)
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -54,7 +54,6 @@ struct nested_vmx_msrs {
  
  struct vmcs_config {
         int size;
-       int order;
         u32 basic_cap;
         u32 revision_id;
         u32 pin_based_exec_ctrl;
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c

index ba6f99f..87e3dc1 100644 (file)
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -12,8 +12,6 @@
  
  DEFINE_STATIC_KEY_FALSE(enable_evmcs);
  
-#if IS_ENABLED(CONFIG_HYPERV)
-
  #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
  #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
                 {EVMCS1_OFFSET(name), clean_field}
@@ -296,6 +294,7 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
  };
  const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
  
+#if IS_ENABLED(CONFIG_HYPERV)
  __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
  {
         vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
@@ -362,6 +361,7 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
         case MSR_IA32_VMX_PROCBASED_CTLS2:
                 ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
                 break;
+       case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
         case MSR_IA32_VMX_PINBASED_CTLS:
                 ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
                 break;
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h

index 16731d2..8d70f9a 100644 (file)
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -59,12 +59,12 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
          SECONDARY_EXEC_SHADOW_VMCS |                                   \
          SECONDARY_EXEC_TSC_SCALING |                                   \
          SECONDARY_EXEC_PAUSE_LOOP_EXITING)
-#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL                                 \
+       (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |                           \
+        VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
  #define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
  #define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
  
-#if IS_ENABLED(CONFIG_HYPERV)
-
  struct evmcs_field {
         u16 offset;
         u16 clean_field;
@@ -73,26 +73,56 @@ struct evmcs_field {
  extern const struct evmcs_field vmcs_field_to_evmcs_1[];
  extern const unsigned int nr_evmcs_1_fields;
  
-static __always_inline int get_evmcs_offset(unsigned long field,
-                                           u16 *clean_field)
+static __always_inline int evmcs_field_offset(unsigned long field,
+                                             u16 *clean_field)
  {
         unsigned int index = ROL16(field, 6);
         const struct evmcs_field *evmcs_field;
  
-       if (unlikely(index >= nr_evmcs_1_fields)) {
-               WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n",
-                         field);
+       if (unlikely(index >= nr_evmcs_1_fields))
                 return -ENOENT;
-       }
  
         evmcs_field = &vmcs_field_to_evmcs_1[index];
  
+       /*
+        * Use offset=0 to detect holes in eVMCS. This offset belongs to
+        * 'revision_id' but this field has no encoding and is supposed to
+        * be accessed directly.
+        */
+       if (unlikely(!evmcs_field->offset))
+               return -ENOENT;
+
         if (clean_field)
                 *clean_field = evmcs_field->clean_field;
  
         return evmcs_field->offset;
  }
  
+static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs,
+                                unsigned long field, u16 offset)
+{
+       /*
+        * vmcs12_read_any() doesn't care whether the supplied structure
+        * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes
+        * the exact offset of the required field, use it for convenience
+        * here.
+        */
+       return vmcs12_read_any((void *)evmcs, field, offset);
+}
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+static __always_inline int get_evmcs_offset(unsigned long field,
+                                           u16 *clean_field)
+{
+       int offset = evmcs_field_offset(field, clean_field);
+
+       WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n",
+                 field);
+
+       return offset;
+}
+
  static __always_inline void evmcs_write64(unsigned long field, u64 value)
  {
         u16 clean_field;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c

index f235f77..ba34e94 100644 (file)
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -7,6 +7,7 @@
  #include <asm/mmu_context.h>
  
  #include "cpuid.h"
+#include "evmcs.h"
  #include "hyperv.h"
  #include "mmu.h"
  #include "nested.h"
@@ -4851,18 +4852,20 @@ static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
         struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
  
         /*
-        * We should allocate a shadow vmcs for vmcs01 only when L1
-        * executes VMXON and free it when L1 executes VMXOFF.
-        * As it is invalid to execute VMXON twice, we shouldn't reach
-        * here when vmcs01 already have an allocated shadow vmcs.
+        * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
+        * when L1 executes VMXOFF or the vCPU is forced out of nested
+        * operation.  VMXON faults if the CPU is already post-VMXON, so it
+        * should be impossible to already have an allocated shadow VMCS.  KVM
+        * doesn't support virtualization of VMCS shadowing, so vmcs01 should
+        * always be the loaded VMCS.
          */
-       WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
+       if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
+               return loaded_vmcs->shadow_vmcs;
+
+       loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
+       if (loaded_vmcs->shadow_vmcs)
+               vmcs_clear(loaded_vmcs->shadow_vmcs);
  
-       if (!loaded_vmcs->shadow_vmcs) {
-               loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
-               if (loaded_vmcs->shadow_vmcs)
-                       vmcs_clear(loaded_vmcs->shadow_vmcs);
-       }
         return loaded_vmcs->shadow_vmcs;
  }
  
@@ -5099,27 +5102,49 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
         if (!nested_vmx_check_permission(vcpu))
                 return 1;
  
-       /*
-        * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
-        * any VMREAD sets the ALU flags for VMfailInvalid.
-        */
-       if (vmx->nested.current_vmptr == INVALID_GPA ||
-           (is_guest_mode(vcpu) &&
-            get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
-               return nested_vmx_failInvalid(vcpu);
-
         /* Decode instruction info and find the field to read */
         field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
  
-       offset = vmcs_field_to_offset(field);
-       if (offset < 0)
-               return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+       if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+               /*
+                * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
+                * any VMREAD sets the ALU flags for VMfailInvalid.
+                */
+               if (vmx->nested.current_vmptr == INVALID_GPA ||
+                   (is_guest_mode(vcpu) &&
+                    get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
+                       return nested_vmx_failInvalid(vcpu);
  
-       if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
-               copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+               offset = get_vmcs12_field_offset(field);
+               if (offset < 0)
+                       return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+               if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
+                       copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
  
-       /* Read the field, zero-extended to a u64 value */
-       value = vmcs12_read_any(vmcs12, field, offset);
+               /* Read the field, zero-extended to a u64 value */
+               value = vmcs12_read_any(vmcs12, field, offset);
+       } else {
+               /*
+                * Hyper-V TLFS (as of 6.0b) explicitly states that, while an
+                * enlightened VMCS is active, VMREAD/VMWRITE instructions are
+                * unsupported. Unfortunately, certain versions of Windows 11
+                * don't comply with this requirement which is not enforced in
+                * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
+                * workaround, as misbehaving guests will panic on VM-Fail.
+                * Note, enlightened VMCS is incompatible with shadow VMCS so
+                * all VMREADs from L2 should go to L1.
+                */
+               if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+                       return nested_vmx_failInvalid(vcpu);
+
+               offset = evmcs_field_offset(field, NULL);
+               if (offset < 0)
+                       return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+               /* Read the field, zero-extended to a u64 value */
+               value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset);
+       }
  
         /*
          * Now copy part of this value to register or memory, as requested.
@@ -5214,7 +5239,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
  
         field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
  
-       offset = vmcs_field_to_offset(field);
+       offset = get_vmcs12_field_offset(field);
         if (offset < 0)
                 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
  
@@ -6462,7 +6487,7 @@ static u64 nested_vmx_calc_vmcs_enum_msr(void)
         max_idx = 0;
         for (i = 0; i < nr_vmcs12_fields; i++) {
                 /* The vmcs12 table is very, very sparsely populated. */
-               if (!vmcs_field_to_offset_table[i])
+               if (!vmcs12_field_offsets[i])
                         continue;
  
                 idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
@@ -6771,6 +6796,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
  }
  
  struct kvm_x86_nested_ops vmx_nested_ops = {
+       .leave_nested = vmx_leave_nested,
         .check_events = vmx_check_nested_events,
         .hv_timer_pending = nested_vmx_preemption_timer_pending,
         .triple_fault = nested_vmx_triple_fault,
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c

index cab6ba7..2251b60 100644 (file)
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -8,7 +8,7 @@
         FIELD(number, name),                                            \
         [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
  
-const unsigned short vmcs_field_to_offset_table[] = {
+const unsigned short vmcs12_field_offsets[] = {
         FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
         FIELD(POSTED_INTR_NV, posted_intr_nv),
         FIELD(GUEST_ES_SELECTOR, guest_es_selector),
@@ -151,4 +151,4 @@ const unsigned short vmcs_field_to_offset_table[] = {
         FIELD(HOST_RSP, host_rsp),
         FIELD(HOST_RIP, host_rip),
  };
-const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs_field_to_offset_table);
+const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets);
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h

index 2a45f02..746129d 100644 (file)
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -361,10 +361,10 @@ static inline void vmx_check_vmcs12_offsets(void)
         CHECK_OFFSET(guest_pml_index, 996);
  }
  
-extern const unsigned short vmcs_field_to_offset_table[];
+extern const unsigned short vmcs12_field_offsets[];
  extern const unsigned int nr_vmcs12_fields;
  
-static inline short vmcs_field_to_offset(unsigned long field)
+static inline short get_vmcs12_field_offset(unsigned long field)
  {
         unsigned short offset;
         unsigned int index;
@@ -377,7 +377,7 @@ static inline short vmcs_field_to_offset(unsigned long field)
                 return -ENOENT;
  
         index = array_index_nospec(index, nr_vmcs12_fields);
-       offset = vmcs_field_to_offset_table[index];
+       offset = vmcs12_field_offsets[index];
         if (offset == 0)
                 return -ENOENT;
         return offset;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c

index 4ac6760..aca3ae2 100644 (file)
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1487,11 +1487,12 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
         return 0;
  }
  
-static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+                                       void *insn, int insn_len)
  {
         /*
          * Emulation of instructions in SGX enclaves is impossible as RIP does
-        * not point  tthe failing instruction, and even if it did, the code
+        * not point at the failing instruction, and even if it did, the code
          * stream is inaccessible.  Inject #UD instead of exiting to userspace
          * so that guest userspace can't DoS the guest simply by triggering
          * emulation (enclaves are CPL3 only).
@@ -2603,7 +2604,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
                 return -EIO;
  
         vmcs_conf->size = vmx_msr_high & 0x1fff;
-       vmcs_conf->order = get_order(vmcs_conf->size);
         vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
  
         vmcs_conf->revision_id = vmx_msr_low;
@@ -2628,7 +2628,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
         struct page *pages;
         struct vmcs *vmcs;
  
-       pages = __alloc_pages_node(node, flags, vmcs_config.order);
+       pages = __alloc_pages_node(node, flags, 0);
         if (!pages)
                 return NULL;
         vmcs = page_address(pages);
@@ -2647,7 +2647,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
  
  void free_vmcs(struct vmcs *vmcs)
  {
-       free_pages((unsigned long)vmcs, vmcs_config.order);
+       free_page((unsigned long)vmcs);
  }
  
  /*
@@ -4094,10 +4094,14 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
  
         /*
-        * If 32-bit syscall is enabled, vmx_vcpu_load_vcms rewrites
-        * HOST_IA32_SYSENTER_ESP.
+        * SYSENTER is used for 32-bit system calls on either 32-bit or
+        * 64-bit kernels.  It is always zero if neither is allowed, otherwise
+        * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
+        * have already done so!).
          */
-       vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
+       if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
+               vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
+
         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
  
@@ -4901,8 +4905,33 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
                 dr6 = vmx_get_exit_qual(vcpu);
                 if (!(vcpu->guest_debug &
                       (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+                       /*
+                        * If the #DB was due to ICEBP, a.k.a. INT1, skip the
+                        * instruction.  ICEBP generates a trap-like #DB, but
+                        * despite its interception control being tied to #DB,
+                        * is an instruction intercept, i.e. the VM-Exit occurs
+                        * on the ICEBP itself.  Note, skipping ICEBP also
+                        * clears STI and MOVSS blocking.
+                        *
+                        * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
+                        * if single-step is enabled in RFLAGS and STI or MOVSS
+                        * blocking is active, as the CPU doesn't set the bit
+                        * on VM-Exit due to #DB interception.  VM-Entry has a
+                        * consistency check that a single-step #DB is pending
+                        * in this scenario as the previous instruction cannot
+                        * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
+                        * don't modify RFLAGS), therefore the one instruction
+                        * delay when activating single-step breakpoints must
+                        * have already expired.  Note, the CPU sets/clears BS
+                        * as appropriate for all other VM-Exit types.
+                        */
                         if (is_icebp(intr_info))
                                 WARN_ON(!skip_emulated_instruction(vcpu));
+                       else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
+                                (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+                                 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
+                               vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+                                           vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
  
                         kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
                         return 1;
@@ -5397,7 +5426,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
  {
         gpa_t gpa;
  
-       if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
+       if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
                 return 1;
  
         /*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index 9e43d75..74b53a1 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3535,6 +3535,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 if (data & ~supported_xss)
                         return 1;
                 vcpu->arch.ia32_xss = data;
+               kvm_update_cpuid_runtime(vcpu);
                 break;
         case MSR_SMI_COUNT:
                 if (!msr_info->host_initiated)
@@ -4229,6 +4230,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
         case KVM_CAP_SREGS2:
         case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
         case KVM_CAP_VCPU_ATTRIBUTES:
+       case KVM_CAP_SYS_ATTRIBUTES:
                 r = 1;
                 break;
         case KVM_CAP_EXIT_HYPERCALL:
@@ -4331,7 +4333,49 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                 break;
         }
         return r;
+}
+
+static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
+{
+       void __user *uaddr = (void __user*)(unsigned long)attr->addr;
  
+       if ((u64)(unsigned long)uaddr != attr->addr)
+               return ERR_PTR(-EFAULT);
+       return uaddr;
+}
+
+static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
+{
+       u64 __user *uaddr = kvm_get_attr_addr(attr);
+
+       if (attr->group)
+               return -ENXIO;
+
+       if (IS_ERR(uaddr))
+               return PTR_ERR(uaddr);
+
+       switch (attr->attr) {
+       case KVM_X86_XCOMP_GUEST_SUPP:
+               if (put_user(supported_xcr0, uaddr))
+                       return -EFAULT;
+               return 0;
+       default:
+               return -ENXIO;
+               break;
+       }
+}
+
+static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
+{
+       if (attr->group)
+               return -ENXIO;
+
+       switch (attr->attr) {
+       case KVM_X86_XCOMP_GUEST_SUPP:
+               return 0;
+       default:
+               return -ENXIO;
+       }
  }
  
  long kvm_arch_dev_ioctl(struct file *filp,
@@ -4422,6 +4466,22 @@ long kvm_arch_dev_ioctl(struct file *filp,
         case KVM_GET_SUPPORTED_HV_CPUID:
                 r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
                 break;
+       case KVM_GET_DEVICE_ATTR: {
+               struct kvm_device_attr attr;
+               r = -EFAULT;
+               if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+                       break;
+               r = kvm_x86_dev_get_attr(&attr);
+               break;
+       }
+       case KVM_HAS_DEVICE_ATTR: {
+               struct kvm_device_attr attr;
+               r = -EFAULT;
+               if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+                       break;
+               r = kvm_x86_dev_has_attr(&attr);
+               break;
+       }
         default:
                 r = -EINVAL;
                 break;
@@ -4860,8 +4920,10 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                 vcpu->arch.apic->sipi_vector = events->sipi_vector;
  
         if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
-               if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm)
+               if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
+                       kvm_x86_ops.nested_ops->leave_nested(vcpu);
                         kvm_smm_changed(vcpu, events->smi.smm);
+               }
  
                 vcpu->arch.smi_pending = events->smi.pending;
  
@@ -5022,11 +5084,11 @@ static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
  static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
                                  struct kvm_device_attr *attr)
  {
-       u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr;
+       u64 __user *uaddr = kvm_get_attr_addr(attr);
         int r;
  
-       if ((u64)(unsigned long)uaddr != attr->addr)
-               return -EFAULT;
+       if (IS_ERR(uaddr))
+               return PTR_ERR(uaddr);
  
         switch (attr->attr) {
         case KVM_VCPU_TSC_OFFSET:
@@ -5045,12 +5107,12 @@ static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
  static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
                                  struct kvm_device_attr *attr)
  {
-       u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr;
+       u64 __user *uaddr = kvm_get_attr_addr(attr);
         struct kvm *kvm = vcpu->kvm;
         int r;
  
-       if ((u64)(unsigned long)uaddr != attr->addr)
-               return -EFAULT;
+       if (IS_ERR(uaddr))
+               return PTR_ERR(uaddr);
  
         switch (attr->attr) {
         case KVM_VCPU_TSC_OFFSET: {
@@ -6810,6 +6872,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
  }
  EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
  
+static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
+                               void *insn, int insn_len)
+{
+       return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type,
+                                                           insn, insn_len);
+}
+
  int handle_ud(struct kvm_vcpu *vcpu)
  {
         static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
@@ -6817,7 +6886,7 @@ int handle_ud(struct kvm_vcpu *vcpu)
         char sig[5]; /* ud2; .ascii "kvm" */
         struct x86_exception e;
  
-       if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, NULL, 0)))
+       if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
                 return 1;
  
         if (force_emulation_prefix &&
@@ -8193,7 +8262,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
         bool writeback = true;
         bool write_fault_to_spt;
  
-       if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, insn, insn_len)))
+       if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
                 return 1;
  
         vcpu->arch.l1tf_flush_l1d = true;
@@ -9706,7 +9775,7 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
                 kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
  }
  
-void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
+static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
  {
         if (!lapic_in_kernel(vcpu))
                 return;
@@ -11209,7 +11278,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
  
                 vcpu->arch.msr_misc_features_enables = 0;
  
-               vcpu->arch.xcr0 = XFEATURE_MASK_FP;
+               __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
+               __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
         }
  
         /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
@@ -11226,8 +11296,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
         cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0);
         kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
  
-       vcpu->arch.ia32_xss = 0;
-
         static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
  
         kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c

index 0e3f7d6..bad5753 100644 (file)
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -316,10 +316,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
                                      "\tnotq %0\n"
                                      "\t" LOCK_PREFIX "andq %0, %2\n"
                                      "2:\n"
-                                    "\t.section .fixup,\"ax\"\n"
-                                    "3:\tjmp\t2b\n"
-                                    "\t.previous\n"
-                                    _ASM_EXTABLE_UA(1b, 3b)
+                                    _ASM_EXTABLE_UA(1b, 2b)
                                      : "=r" (evtchn_pending_sel),
                                        "+m" (vi->evtchn_pending_sel),
                                        "+m" (v->arch.xen.evtchn_pending_sel)
@@ -335,10 +332,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
                                      "\tnotl %0\n"
                                      "\t" LOCK_PREFIX "andl %0, %2\n"
                                      "2:\n"
-                                    "\t.section .fixup,\"ax\"\n"
-                                    "3:\tjmp\t2b\n"
-                                    "\t.previous\n"
-                                    _ASM_EXTABLE_UA(1b, 3b)
+                                    _ASM_EXTABLE_UA(1b, 2b)
                                      : "=r" (evtchn_pending_sel32),
                                        "+m" (vi->evtchn_pending_sel),
                                        "+m" (v->arch.xen.evtchn_pending_sel)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h

index 9563d29..b46bcdb 100644 (file)
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1133,6 +1133,7 @@ struct kvm_ppc_resize_hpt {
  #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
  #define KVM_CAP_VM_GPA_BITS 207
  #define KVM_CAP_XSAVE2 208
+#define KVM_CAP_SYS_ATTRIBUTES 209
  
  #ifdef KVM_CAP_IRQ_ROUTING
  
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h

index 2da3316..bf6e960 100644 (file)
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -452,6 +452,9 @@ struct kvm_sync_regs {
  
  #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE        0x00000001
  
+/* attributes for system fd (group 0) */
+#define KVM_X86_XCOMP_GUEST_SUPP       0
+
  struct kvm_vmx_nested_state_data {
         __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
         __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h

index 9563d29..b46bcdb 100644 (file)
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -1133,6 +1133,7 @@ struct kvm_ppc_resize_hpt {
  #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
  #define KVM_CAP_VM_GPA_BITS 207
  #define KVM_CAP_XSAVE2 208
+#define KVM_CAP_SYS_ATTRIBUTES 209
  
  #ifdef KVM_CAP_IRQ_ROUTING
  
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile

index 81ebf99..0e4926b 100644 (file)
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -85,6 +85,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
  TEST_GEN_PROGS_x86_64 += x86_64/vmx_pi_mmio_test
  TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests
  TEST_GEN_PROGS_x86_64 += x86_64/amx_test
+TEST_GEN_PROGS_x86_64 += access_tracking_perf_test
  TEST_GEN_PROGS_x86_64 += demand_paging_test
  TEST_GEN_PROGS_x86_64 += dirty_log_test
  TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h

index 66775de..4ed6aa0 100644 (file)
--- a/tools/testing/selftests/kvm/include/kvm_util_base.h
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@ -345,7 +345,6 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
   *   guest_code - The vCPU's entry point
   */
  void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
-void vm_xsave_req_perm(void);
  
  bool vm_is_unrestricted_guest(struct kvm_vm *vm);
  
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h

index 423d8a6..8a470da 100644 (file)
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -458,6 +458,7 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
  struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void);
  void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
  struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
+void vm_xsave_req_perm(int bit);
  
  enum x86_page_size {
         X86_PAGE_SIZE_4K = 0,
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c

index 8c53f96..d8cf851 100644 (file)
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -393,13 +393,6 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
         struct kvm_vm *vm;
         int i;
  
-#ifdef __x86_64__
-       /*
-        * Permission needs to be requested before KVM_SET_CPUID2.
-        */
-       vm_xsave_req_perm();
-#endif
-
         /* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */
         if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES)
                 slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES;
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c

index 5f9d7e9..9f000df 100644 (file)
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -665,16 +665,31 @@ static bool is_xfd_supported(void)
         return !!(eax & CPUID_XFD_BIT);
  }
  
-void vm_xsave_req_perm(void)
+void vm_xsave_req_perm(int bit)
  {
-       unsigned long bitmask;
+       int kvm_fd;
+       u64 bitmask;
         long rc;
+       struct kvm_device_attr attr = {
+               .group = 0,
+               .attr = KVM_X86_XCOMP_GUEST_SUPP,
+               .addr = (unsigned long) &bitmask
+       };
+
+       kvm_fd = open_kvm_dev_path_or_exit();
+       rc = ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr);
+       close(kvm_fd);
+       if (rc == -1 && (errno == ENXIO || errno == EINVAL))
+               exit(KSFT_SKIP);
+       TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc);
+       if (!(bitmask & (1ULL << bit)))
+               exit(KSFT_SKIP);
  
         if (!is_xfd_supported())
-               return;
+               exit(KSFT_SKIP);
+
+       rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit);
  
-       rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM,
-                    XSTATE_XTILE_DATA_BIT);
         /*
          * The older kernel version(<5.15) can't support
          * ARCH_REQ_XCOMP_GUEST_PERM and directly return.
@@ -684,7 +699,7 @@ void vm_xsave_req_perm(void)
  
         rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask);
         TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc);
-       TEST_ASSERT(bitmask & XFEATURE_XTILE_MASK,
+       TEST_ASSERT(bitmask & (1ULL << bit),
                     "prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure bitmask=0x%lx",
                     bitmask);
  }
diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c

index 523c1e9..52a3ef6 100644 (file)
--- a/tools/testing/selftests/kvm/x86_64/amx_test.c
+++ b/tools/testing/selftests/kvm/x86_64/amx_test.c
@@ -329,6 +329,8 @@ int main(int argc, char *argv[])
         u32 amx_offset;
         int stage, ret;
  
+       vm_xsave_req_perm(XSTATE_XTILE_DATA_BIT);
+
         /* Create VM */
         vm = vm_create_default(VCPU_ID, 0, guest_code);
  
diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c

index 2da8eb8..a626d40 100644 (file)
--- a/tools/testing/selftests/kvm/x86_64/smm_test.c
+++ b/tools/testing/selftests/kvm/x86_64/smm_test.c
@@ -105,7 +105,6 @@ static void guest_code(void *arg)
  
                 if (cpu_has_svm()) {
                         run_guest(svm->vmcb, svm->vmcb_gpa);
-                       svm->vmcb->save.rip += 3;
                         run_guest(svm->vmcb, svm->vmcb_gpa);
                 } else {
                         vmlaunch();
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c

index 2ad013b..59b1dd4 100644 (file)
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -463,8 +463,8 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
         idx = srcu_read_lock(&kvm->irq_srcu);
         gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
         if (gsi != -1)
-               hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
-                                        link)
+               hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
+                                         link, srcu_read_lock_held(&kvm->irq_srcu))
                         if (kian->gsi == gsi) {
                                 srcu_read_unlock(&kvm->irq_srcu, idx);
                                 return true;
@@ -480,8 +480,8 @@ void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
  {
         struct kvm_irq_ack_notifier *kian;
  
-       hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
-                                link)
+       hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
+                                 link, srcu_read_lock_held(&kvm->irq_srcu))
                 if (kian->gsi == gsi)
                         kian->irq_acked(kian);
  }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c

index 9a20f22..58d31da 100644 (file)
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2248,7 +2248,6 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn
  
         return NULL;
  }
-EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
  
  bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
  {
@@ -2463,9 +2462,8 @@ static int kvm_try_get_pfn(kvm_pfn_t pfn)
  }
  
  static int hva_to_pfn_remapped(struct vm_area_struct *vma,
-                              unsigned long addr, bool *async,
-                              bool write_fault, bool *writable,
-                              kvm_pfn_t *p_pfn)
+                              unsigned long addr, bool write_fault,
+                              bool *writable, kvm_pfn_t *p_pfn)
  {
         kvm_pfn_t pfn;
         pte_t *ptep;
@@ -2575,7 +2573,7 @@ retry:
         if (vma == NULL)
                 pfn = KVM_PFN_ERR_FAULT;
         else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
-               r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
+               r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
                 if (r == -EAGAIN)
                         goto retry;
                 if (r < 0)
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 28 Jan 2022 17:00:26 +0000 (19:00 +0200)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 28 Jan 2022 17:00:26 +0000 (19:00 +0200)
Documentation/virt/kvm/api.rst		patch \| blob \| history
arch/arm64/kvm/hyp/exception.c		patch \| blob \| history
arch/arm64/kvm/hyp/pgtable.c		patch \| blob \| history
arch/arm64/kvm/hyp/vgic-v3-sr.c		patch \| blob \| history
arch/arm64/kvm/vgic/vgic-v3.c		patch \| blob \| history
arch/x86/include/asm/kvm_host.h		patch \| blob \| history
arch/x86/include/uapi/asm/kvm.h		patch \| blob \| history
arch/x86/kvm/cpuid.c		patch \| blob \| history
arch/x86/kvm/lapic.c		patch \| blob \| history
arch/x86/kvm/svm/nested.c		patch \| blob \| history
arch/x86/kvm/svm/sev.c		patch \| blob \| history
arch/x86/kvm/svm/svm.c		patch \| blob \| history
arch/x86/kvm/svm/svm.h		patch \| blob \| history
arch/x86/kvm/svm/svm_onhyperv.h		patch \| blob \| history
arch/x86/kvm/vmx/capabilities.h		patch \| blob \| history
arch/x86/kvm/vmx/evmcs.c		patch \| blob \| history
arch/x86/kvm/vmx/evmcs.h		patch \| blob \| history
arch/x86/kvm/vmx/nested.c		patch \| blob \| history
arch/x86/kvm/vmx/vmcs12.c		patch \| blob \| history
arch/x86/kvm/vmx/vmcs12.h		patch \| blob \| history
arch/x86/kvm/vmx/vmx.c		patch \| blob \| history
arch/x86/kvm/x86.c		patch \| blob \| history
arch/x86/kvm/xen.c		patch \| blob \| history
include/uapi/linux/kvm.h		patch \| blob \| history
tools/arch/x86/include/uapi/asm/kvm.h		patch \| blob \| history
tools/include/uapi/linux/kvm.h		patch \| blob \| history
tools/testing/selftests/kvm/Makefile		patch \| blob \| history
tools/testing/selftests/kvm/include/kvm_util_base.h		patch \| blob \| history
tools/testing/selftests/kvm/include/x86_64/processor.h		patch \| blob \| history
tools/testing/selftests/kvm/lib/kvm_util.c		patch \| blob \| history
tools/testing/selftests/kvm/lib/x86_64/processor.c		patch \| blob \| history
tools/testing/selftests/kvm/x86_64/amx_test.c		patch \| blob \| history
tools/testing/selftests/kvm/x86_64/smm_test.c		patch \| blob \| history
virt/kvm/eventfd.c		patch \| blob \| history
virt/kvm/kvm_main.c		patch \| blob \| history