Merge tag 'kvmarm-5.12' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmar...
author Paolo Bonzini <pbonzini@redhat.com>
Fri, 12 Feb 2021 16:23:44 +0000 (11:23 -0500)
committer Paolo Bonzini <pbonzini@redhat.com>
Fri, 12 Feb 2021 16:23:44 +0000 (11:23 -0500)
KVM/arm64 updates for Linux 5.12

- Make the nVHE EL2 object relocatable, resulting in much more
  maintainable code
- Handle concurrent translation faults hitting the same page
  in a more elegant way
- Support for the standard TRNG hypervisor call
- A bunch of small PMU/Debug fixes
- Allow the disabling of symbol export from assembly code
- Simplification of the early init hypercall handling

124 files changed:
Documentation/virt/kvm/amd-memory-encryption.rst
Documentation/virt/kvm/api.rst
Documentation/virt/kvm/locking.rst
Documentation/virt/kvm/nested-vmx.rst
Documentation/virt/kvm/running-nested-guests.rst
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/spinlock.h
arch/arm64/kvm/hyp/nvhe/hyp-init.S
arch/mips/include/asm/kvm_host.h
arch/mips/include/asm/spinlock.h
arch/powerpc/include/asm/hvcall.h
arch/powerpc/include/asm/kvm_book3s_asm.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/uapi/asm/kvm.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_builtin.c
arch/powerpc/kvm/book3s_hv_nested.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/powerpc.c
arch/s390/include/asm/kvm_host.h
arch/s390/pci/pci_mmio.c
arch/sparc/include/asm/spinlock_64.h
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/kvm-x86-ops.h [new file with mode: 0644]
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/virtext.h
arch/x86/include/asm/vmx.h
arch/x86/include/asm/vmxfeatures.h
arch/x86/include/asm/xen/interface.h
arch/x86/include/uapi/asm/kvm.h
arch/x86/include/uapi/asm/vmx.h
arch/x86/kernel/apic/apic.c
arch/x86/kernel/reboot.c
arch/x86/kvm/Makefile
arch/x86/kvm/cpuid.c
arch/x86/kvm/cpuid.h
arch/x86/kvm/emulate.c
arch/x86/kvm/hyperv.c
arch/x86/kvm/hyperv.h
arch/x86/kvm/irq.c
arch/x86/kvm/kvm_cache_regs.h
arch/x86/kvm/kvm_emulate.h
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu.h
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/mmu_audit.c
arch/x86/kvm/mmu/mmu_internal.h
arch/x86/kvm/mmu/page_track.c
arch/x86/kvm/mmu/paging_tmpl.h
arch/x86/kvm/mmu/spte.c
arch/x86/kvm/mmu/spte.h
arch/x86/kvm/mmu/tdp_iter.c
arch/x86/kvm/mmu/tdp_iter.h
arch/x86/kvm/mmu/tdp_mmu.c
arch/x86/kvm/mmu/tdp_mmu.h
arch/x86/kvm/mtrr.c
arch/x86/kvm/pmu.c
arch/x86/kvm/pmu.h
arch/x86/kvm/svm/avic.c
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/sev.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/svm/svm_ops.h [new file with mode: 0644]
arch/x86/kvm/trace.h
arch/x86/kvm/vmx/capabilities.h
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/pmu_intel.c
arch/x86/kvm/vmx/posted_intr.c
arch/x86/kvm/vmx/vmenter.S
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
arch/x86/kvm/xen.c [new file with mode: 0644]
arch/x86/kvm/xen.h [new file with mode: 0644]
arch/x86/mm/mem_encrypt.c
arch/xtensa/include/asm/spinlock.h
drivers/crypto/ccp/sev-dev.c
drivers/gpu/drm/i915/gvt/kvmgt.c
fs/dax.c
include/asm-generic/qrwlock.h
include/linux/kvm_host.h
include/linux/mm.h
include/linux/psp-sev.h
include/linux/rwlock.h
include/linux/sched.h
include/uapi/linux/kvm.h
include/xen/interface/xen.h
kernel/locking/qrwlock.c
kernel/sched/core.c
mm/memory.c
tools/arch/powerpc/include/uapi/asm/kvm.h
tools/include/uapi/linux/kvm.h
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/demand_paging_test.c
tools/testing/selftests/kvm/dirty_log_perf_test.c
tools/testing/selftests/kvm/include/kvm_util.h
tools/testing/selftests/kvm/include/numaif.h [new file with mode: 0644]
tools/testing/selftests/kvm/include/perf_test_util.h
tools/testing/selftests/kvm/include/test_util.h
tools/testing/selftests/kvm/include/x86_64/processor.h
tools/testing/selftests/kvm/lib/kvm_util.c
tools/testing/selftests/kvm/lib/perf_test_util.c
tools/testing/selftests/kvm/lib/test_util.c
tools/testing/selftests/kvm/lib/x86_64/processor.c
tools/testing/selftests/kvm/lib/x86_64/svm.c
tools/testing/selftests/kvm/memslot_modification_stress_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/settings [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/evmcs_test.c
tools/testing/selftests/kvm/x86_64/get_cpuid_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
tools/testing/selftests/kvm/x86_64/vmx_pmu_msrs_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c [new file with mode: 0644]
tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c [new file with mode: 0644]
virt/kvm/dirty_ring.c
virt/kvm/kvm_main.c
virt/kvm/mmu_lock.h [new file with mode: 0644]

index 09a8f2a..469a630 100644 (file)
@@ -263,6 +263,27 @@ Returns: 0 on success, -negative on error
                 __u32 trans_len;
         };
 
+10. KVM_SEV_GET_ATTESTATION_REPORT
+----------------------------------
+
+The KVM_SEV_GET_ATTESTATION_REPORT command can be used by the hypervisor to query the attestation
+report containing the SHA-256 digest of the guest memory and VMSA passed through the KVM_SEV_LAUNCH
+commands and signed with the PEK. The digest returned by the command should match the digest
+obtained by the guest owner with the KVM_SEV_LAUNCH_MEASURE command.
+
+Parameters (in): struct kvm_sev_attestation_report
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_attestation_report {
+                __u8 mnonce[16];        /* A random mnonce that will be placed in the report */
+
+                __u64 uaddr;            /* userspace address where the report should be copied */
+                __u32 len;
+        };
+
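+For illustration only (not part of this patch), a VMM could issue the command
+through the KVM_MEMORY_ENCRYPT_OP vm ioctl roughly as follows; the vm_fd,
+sev_fd, nonce and buffer size used here are assumptions::
+
+        struct kvm_sev_attestation_report report = {};
+        struct kvm_sev_cmd cmd = {};
+        __u8 buf[512];                          /* size picked arbitrarily */
+
+        memcpy(report.mnonce, nonce, sizeof(report.mnonce));
+        report.uaddr = (__u64)(unsigned long)buf;
+        report.len = sizeof(buf);
+
+        cmd.id = KVM_SEV_GET_ATTESTATION_REPORT;
+        cmd.data = (__u64)(unsigned long)&report;
+        cmd.sev_fd = sev_fd;                    /* fd of /dev/sev */
+
+        if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd))
+                perror("KVM_SEV_GET_ATTESTATION_REPORT");
+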
 References
 ==========
 
index a9bf7f2..45fd862 100644 (file)
@@ -360,10 +360,9 @@ since the last call to this ioctl.  Bit 0 is the first page in the
 memory slot.  Ensure the entire structure is cleared to avoid padding
 issues.
 
-If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies
-the address space for which you want to return the dirty bitmap.
-They must be less than the value that KVM_CHECK_EXTENSION returns for
-the KVM_CAP_MULTI_ADDRESS_SPACE capability.
+If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of the slot field
+specify the address space for which you want to return the dirty bitmap.  See
+KVM_SET_USER_MEMORY_REGION for details on the usage of the slot field.
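+
+For illustration only (not part of this patch), a VMM that wants the dirty
+bitmap of slot ``slot_id`` in address space ``as_id`` (both names are
+assumptions) would encode the slot field as::
+
+        struct kvm_dirty_log log = {
+                /* bits 16-31: address space id, bits 0-15: slot id */
+                .slot = (as_id << 16) | slot_id,
+                .dirty_bitmap = bitmap,
+        };
+
+        ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);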
 
 The bits in the dirty bitmap are cleared before the ioctl returns, unless
 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled.  For more information,
@@ -961,6 +960,14 @@ memory.
        __u8 pad2[30];
   };
 
+If the KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag is returned from the
+KVM_CAP_XEN_HVM check, it may be set in the flags field of this ioctl.
+This requests KVM to generate the contents of the hypercall page
+automatically; hypercalls will be intercepted and passed to userspace
+through KVM_EXIT_XEN.  In this case, all of the blob size and address
+fields must be zero.
+
+No other flags are currently valid in the struct kvm_xen_hvm_config.
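+
+For illustration only (not part of this patch), a VMM that wants KVM to fill
+in the hypercall page and intercept hypercalls might do the following; the
+hypercall MSR index is the VMM's own choice and is shown purely as an
+example::
+
+        struct kvm_xen_hvm_config cfg = {
+                .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
+                .msr = 0x40000000,      /* MSR the guest will write, VMM's choice */
+                /* all blob size and address fields left at zero */
+        };
+
+        ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg);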
 
 4.29 KVM_GET_CLOCK
 ------------------
@@ -1336,7 +1343,7 @@ documentation when it pops into existence).
 
 :Capability: KVM_CAP_ENABLE_CAP_VM
 :Architectures: all
-:Type: vcpu ioctl
+:Type: vm ioctl
 :Parameters: struct kvm_enable_cap (in)
 :Returns: 0 on success; -1 on error
 
@@ -2269,6 +2276,8 @@ registers, find a list below:
   PPC     KVM_REG_PPC_PSSCR               64
   PPC     KVM_REG_PPC_DEC_EXPIRY          64
   PPC     KVM_REG_PPC_PTCR                64
+  PPC     KVM_REG_PPC_DAWR1               64
+  PPC     KVM_REG_PPC_DAWRX1              64
   PPC     KVM_REG_PPC_TM_GPR0             64
   ...
   PPC     KVM_REG_PPC_TM_GPR31            64
@@ -4435,7 +4444,7 @@ to I/O ports.
 :Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2
 :Architectures: x86, arm, arm64, mips
 :Type: vm ioctl
-:Parameters: struct kvm_dirty_log (in)
+:Parameters: struct kvm_clear_dirty_log (in)
 :Returns: 0 on success, -1 on error
 
 ::
@@ -4462,10 +4471,9 @@ in KVM's dirty bitmap, and dirty tracking is re-enabled for that page
 (for example via write-protection, or by clearing the dirty bit in
 a page table entry).
 
-If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies
-the address space for which you want to return the dirty bitmap.
-They must be less than the value that KVM_CHECK_EXTENSION returns for
-the KVM_CAP_MULTI_ADDRESS_SPACE capability.
+If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of the slot field
+specify the address space for which you want to clear the dirty status.  See
+KVM_SET_USER_MEMORY_REGION for details on the usage of the slot field.
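+
+For illustration only (not part of this patch), clearing dirty state for the
+first 64 pages of slot ``slot_id`` in address space ``as_id`` (names are
+assumptions) could look like::
+
+        struct kvm_clear_dirty_log clear = {
+                .slot = (as_id << 16) | slot_id,
+                .num_pages = 64,
+                .first_page = 0,
+                .dirty_bitmap = bitmap,         /* bits to clear, relative to first_page */
+        };
+
+        ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);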
 
 This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2
 is enabled; for more information, see the description of the capability.
@@ -4833,6 +4841,101 @@ into user space.
 If a vCPU is in running state while this ioctl is invoked, the vCPU may
 experience inconsistent filtering behavior on MSR accesses.
 
+4.127 KVM_XEN_HVM_SET_ATTR
+--------------------------
+
+:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_xen_hvm_attr
+:Returns: 0 on success, < 0 on error
+
+::
+
+  struct kvm_xen_hvm_attr {
+       __u16 type;
+       __u16 pad[3];
+       union {
+               __u8 long_mode;
+               __u8 vector;
+               struct {
+                       __u64 gfn;
+               } shared_info;
+               __u64 pad[4];
+       } u;
+  };
+
+type values:
+
+KVM_XEN_ATTR_TYPE_LONG_MODE
+  Sets the ABI mode of the VM to 32-bit or 64-bit (long mode). This
+  determines the layout of the shared info pages exposed to the VM.
+
+KVM_XEN_ATTR_TYPE_SHARED_INFO
+  Sets the guest physical frame number at which the Xen "shared info"
+  page resides. Note that although Xen places vcpu_info for the first
+  32 vCPUs in the shared_info page, KVM does not automatically do so
+  and instead requires that KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO be used
+  explicitly even when the vcpu_info for a given vCPU resides at the
+  "default" location in the shared_info page. This is because KVM is
+  not aware of the Xen CPU id which is used as the index into the
+  vcpu_info[] array, so cannot know the correct default location.
+
+KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
+  Sets the exception vector used to deliver Xen event channel upcalls.
+
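+For example (illustrative only, not part of this patch), registering the
+shared_info page at a VMM-chosen guest frame number ``shinfo_gfn``::
+
+        struct kvm_xen_hvm_attr ha = {
+                .type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+                .u.shared_info.gfn = shinfo_gfn,
+        };
+
+        ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &ha);
+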
+4.128 KVM_XEN_HVM_GET_ATTR
+--------------------------
+
+:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_xen_hvm_attr
+:Returns: 0 on success, < 0 on error
+
+Allows Xen VM attributes to be read. For the structure and types,
+see KVM_XEN_HVM_SET_ATTR above.
+
+4.129 KVM_XEN_VCPU_SET_ATTR
+---------------------------
+
+:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
+:Architectures: x86
+:Type: vcpu ioctl
+:Parameters: struct kvm_xen_vcpu_attr
+:Returns: 0 on success, < 0 on error
+
+::
+
+  struct kvm_xen_vcpu_attr {
+       __u16 type;
+       __u16 pad[3];
+       union {
+               __u64 gpa;
+               __u64 pad[4];
+       } u;
+  };
+
+type values:
+
+KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO
+  Sets the guest physical address of the vcpu_info for a given vCPU.
+
+KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO
+  Sets the guest physical address of an additional pvclock structure
+  for a given vCPU. This is typically used for guest vsyscall support.
+
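+For example (illustrative only, not part of this patch), pointing a vCPU's
+vcpu_info at a VMM-chosen guest physical address ``vcpu_info_gpa``::
+
+        struct kvm_xen_vcpu_attr va = {
+                .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
+                .u.gpa = vcpu_info_gpa,
+        };
+
+        ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &va);
+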
+4.130 KVM_XEN_VCPU_GET_ATTR
+---------------------------
+
+:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
+:Architectures: x86
+:Type: vcpu ioctl
+:Parameters: struct kvm_xen_vcpu_attr
+:Returns: 0 on success, < 0 on error
+
+Allows Xen vCPU attributes to be read. For the structure and types,
+see KVM_XEN_VCPU_SET_ATTR above.
 
 5. The kvm_run structure
 ========================
@@ -4895,9 +4998,11 @@ local APIC is not used.
        __u16 flags;
 
 More architecture-specific flags detailing state of the VCPU that may
-affect the device's behavior.  The only currently defined flag is
-KVM_RUN_X86_SMM, which is valid on x86 machines and is set if the
-VCPU is in system management mode.
+affect the device's behavior. Currently defined flags are::
+
+  /* x86, set if the VCPU is in system management mode */
+  #define KVM_RUN_X86_SMM     (1 << 0)
+  /* x86, set if a bus lock was detected in the VM */
+  #define KVM_RUN_BUS_LOCK    (1 << 1)
 
 ::
 
@@ -4998,13 +5103,18 @@ to the byte array.
 
 .. note::
 
-      For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR,
+      For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
       KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
       operations are complete (and guest state is consistent) only after userspace
       has re-entered the kernel with KVM_RUN.  The kernel side will first finish
-      incomplete operations and then check for pending signals.  Userspace
-      can re-enter the guest with an unmasked signal pending to complete
-      pending operations.
+      incomplete operations and then check for pending signals.
+
+      The pending state of the operation is not preserved in state which is
+      visible to userspace, thus userspace should ensure that the operation is
+      completed before performing a live migration.  Userspace can re-enter the
+      guest with an unmasked signal pending or with the immediate_exit field set
+      to complete pending operations without allowing any further instructions
+      to be executed.
 
 ::
 
@@ -5329,6 +5439,34 @@ wants to write. Once finished processing the event, user space must continue
 vCPU execution. If the MSR write was unsuccessful, user space also sets the
 "error" field to "1".
 
+::
+
+
+               struct kvm_xen_exit {
+  #define KVM_EXIT_XEN_HCALL          1
+                       __u32 type;
+                       union {
+                               struct {
+                                       __u32 longmode;
+                                       __u32 cpl;
+                                       __u64 input;
+                                       __u64 result;
+                                       __u64 params[6];
+                               } hcall;
+                       } u;
+               };
+               /* KVM_EXIT_XEN */
+               struct kvm_xen_exit xen;
+
+Indicates that the vCPU has exited to userspace in order to process some
+tasks related to Xen emulation.
+
+Valid values for 'type' are:
+
+  - KVM_EXIT_XEN_HCALL -- synchronously notify userspace about a Xen hypercall.
+    Userspace is expected to place the hypercall result into the appropriate
+    field before invoking KVM_RUN again, as in the sketch below.
+
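+A VMM's run loop might handle this exit roughly as follows (illustrative
+only, not part of this patch; handle_xen_hcall() is a hypothetical VMM
+helper)::
+
+        if (run->exit_reason == KVM_EXIT_XEN &&
+            run->xen.type == KVM_EXIT_XEN_HCALL) {
+                run->xen.u.hcall.result =
+                        handle_xen_hcall(run->xen.u.hcall.input,
+                                         run->xen.u.hcall.params);
+                ioctl(vcpu_fd, KVM_RUN, 0);     /* complete the hypercall */
+        }
+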
 ::
 
                /* Fix the size of the union. */
@@ -6040,6 +6178,53 @@ KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space
 can then handle to implement model specific MSR handling and/or user notifications
 to inform a user that an MSR was not handled.
 
+7.22 KVM_CAP_X86_BUS_LOCK_EXIT
+-------------------------------
+
+:Architectures: x86
+:Target: VM
+:Parameters: args[0] defines the policy used when bus locks detected in guest
+:Returns: 0 on success, -EINVAL when args[0] contains invalid bits
+
+Valid bits in args[0] are::
+
+  #define KVM_BUS_LOCK_DETECTION_OFF      (1 << 0)
+  #define KVM_BUS_LOCK_DETECTION_EXIT     (1 << 1)
+
+Enabling this capability on a VM provides userspace with a way to select a
+policy for handling bus locks detected in the guest. Userspace can obtain the
+supported modes from the result of KVM_CHECK_EXTENSION and select one of them
+through KVM_ENABLE_CAP.
+
+KVM_BUS_LOCK_DETECTION_OFF and KVM_BUS_LOCK_DETECTION_EXIT are currently
+supported and are mutually exclusive with each other. More bits can be added
+in the future.
+
+With KVM_BUS_LOCK_DETECTION_OFF set, bus locks in the guest do not cause VM
+exits, so no additional action is needed. This is the default mode.
+
+With KVM_BUS_LOCK_DETECTION_EXIT set, a VM exit happens whenever a bus lock is
+detected in the guest. KVM simply forwards the event to userspace, which can
+enforce its own throttling or other policy-based mitigations.
+
+This capability is aimed at addressing the threat that a guest can exploit bus
+locks to degrade the performance of the whole system. Once userspace enables
+this capability and selects the KVM_BUS_LOCK_DETECTION_EXIT mode, KVM sets the
+KVM_RUN_BUS_LOCK flag in the vcpu->run->flags field and exits to userspace.
+Because a bus lock VM exit can be preempted by a higher-priority VM exit, the
+exit notification to userspace can carry KVM_EXIT_BUS_LOCK or another exit
+reason; the KVM_RUN_BUS_LOCK flag is used to distinguish between them.
+
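+For illustration only (not part of this patch), a VMM could enable the exit
+policy as follows; ``vm_fd`` is an assumption::
+
+        struct kvm_enable_cap cap = {
+                .cap = KVM_CAP_X86_BUS_LOCK_EXIT,
+                .args[0] = KVM_BUS_LOCK_DETECTION_EXIT,
+        };
+
+        if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_BUS_LOCK_EXIT) &
+            KVM_BUS_LOCK_DETECTION_EXIT)
+                ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
+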
+7.23 KVM_CAP_PPC_DAWR1
+----------------------
+
+:Architectures: ppc
+:Parameters: none
+:Returns: 0 on success, -EINVAL when CPU doesn't support 2nd DAWR
+
+This capability can be used to check for and enable the second DAWR (DAWR1)
+feature provided by the POWER10 processor.
+
 8. Other capabilities.
 ======================
 
@@ -6417,7 +6602,6 @@ guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf
 (0x40000001). Otherwise, a guest may use the paravirtual features
 regardless of what has actually been exposed through the CPUID leaf.
 
-
 8.29 KVM_CAP_DIRTY_LOG_RING
 ---------------------------
 
@@ -6504,3 +6688,29 @@ KVM_GET_DIRTY_LOG and KVM_CLEAR_DIRTY_LOG.  After enabling
 KVM_CAP_DIRTY_LOG_RING with an acceptable dirty ring size, the virtual
 machine will switch to ring-buffer dirty page tracking and further
 KVM_GET_DIRTY_LOG or KVM_CLEAR_DIRTY_LOG ioctls will fail.
+
+8.30 KVM_CAP_XEN_HVM
+--------------------
+
+:Architectures: x86
+
+This capability indicates the Xen HVM features that KVM supports for hosting
+Xen PVHVM guests. Valid flags are::
+
+  #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR     (1 << 0)
+  #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL   (1 << 1)
+  #define KVM_XEN_HVM_CONFIG_SHARED_INFO       (1 << 2)
+
+The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
+ioctl is available, for the guest to set its hypercall page.
+
+If KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL is also set, the same flag may also be
+provided in the flags to KVM_XEN_HVM_CONFIG, without providing hypercall page
+contents, to request that KVM generate hypercall page content automatically
+and also enable interception of guest hypercalls with KVM_EXIT_XEN.
+
+The KVM_XEN_HVM_CONFIG_SHARED_INFO flag indicates the availability of the
+KVM_XEN_HVM_SET_ATTR, KVM_XEN_HVM_GET_ATTR, KVM_XEN_VCPU_SET_ATTR and
+KVM_XEN_VCPU_GET_ATTR ioctls, as well as the delivery of exception vectors
+for event channel upcalls when the evtchn_upcall_pending field of a vcpu's
+vcpu_info is set.
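+
+For illustration only (not part of this patch), userspace can test for these
+features with KVM_CHECK_EXTENSION, which returns the flag bits::
+
+        int xen_caps = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XEN_HVM);
+
+        if (xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO) {
+                /* KVM_XEN_HVM_SET_ATTR and friends are available */
+        }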
index b21a34c..0aa4817 100644 (file)
@@ -16,7 +16,14 @@ The acquisition orders for mutexes are as follows:
 - kvm->slots_lock is taken outside kvm->irq_lock, though acquiring
   them together is quite rare.
 
-On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock.
+On x86:
+
+- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
+
+- kvm->arch.mmu_lock is an rwlock.  kvm->arch.tdp_mmu_pages_lock is
+  taken inside kvm->arch.mmu_lock, and cannot be taken without already
+  holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise
+  there's no need to take kvm->arch.tdp_mmu_pages_lock at all).
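+
+As an illustration of that rule (a reading of the text above, not code taken
+from the patch), the nesting looks like::
+
+    read_lock(&kvm->mmu_lock);
+    spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+    /* ... update the TDP MMU page lists ... */
+    spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+    read_unlock(&kvm->mmu_lock);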
 
 Everything else is a leaf: no other lock is taken inside the critical
 sections.
index 6ab4e35..ac2095d 100644 (file)
@@ -37,8 +37,10 @@ call L2.
 Running nested VMX
 ------------------
 
-The nested VMX feature is disabled by default. It can be enabled by giving
-the "nested=1" option to the kvm-intel module.
+The nested VMX feature has been enabled by default since Linux kernel v4.20.
+On older kernels, it can be enabled by giving the "nested=1" option to the
+kvm-intel module.
+
 
 No modifications are required to user space (qemu). However, qemu's default
 emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be
index d0a1fc7..bd70c69 100644 (file)
@@ -74,7 +74,7 @@ few:
 Enabling "nested" (x86)
 -----------------------
 
-From Linux kernel v4.19 onwards, the ``nested`` KVM parameter is enabled
+From Linux kernel v4.20 onwards, the ``nested`` KVM parameter is enabled
 by default for Intel and AMD.  (Though your Linux distribution might
 override this default.)
 
index 084d11a..3d10e65 100644 (file)
@@ -30,7 +30,6 @@
 
 #define __KVM_HAVE_ARCH_INTC_INITIALIZED
 
-#define KVM_USER_MEM_SLOTS 512
 #define KVM_HALT_POLL_NS_DEFAULT 500000
 
 #include <kvm/arm_vgic.h>
index 9083d69..0525c0b 100644 (file)
@@ -5,8 +5,8 @@
 #ifndef __ASM_SPINLOCK_H
 #define __ASM_SPINLOCK_H
 
-#include <asm/qrwlock.h>
 #include <asm/qspinlock.h>
+#include <asm/qrwlock.h>
 
 /* See include/linux/spinlock.h */
 #define smp_mb__after_spinlock()       smp_mb()
index 3dbc0c6..50be6f4 100644 (file)
@@ -47,6 +47,8 @@ __invalid:
        b       .
 
        /*
+        * Only uses x0..x3 so as to not clobber callee-saved SMCCC registers.
+        *
         * x0: SMCCC function ID
         * x1: struct kvm_nvhe_init_params PA
         */
@@ -63,9 +65,9 @@ __do_hyp_init:
        eret
 
 1:     mov     x0, x1
-       mov     x4, lr
-       bl      ___kvm_hyp_init
-       mov     lr, x4
+       mov     x3, lr
+       bl      ___kvm_hyp_init                 // Clobbers x0..x2
+       mov     lr, x3
 
        /* Hello, World! */
        mov     x0, #SMCCC_RET_SUCCESS
@@ -75,8 +77,8 @@ SYM_CODE_END(__kvm_hyp_init)
 /*
  * Initialize the hypervisor in EL2.
  *
- * Only uses x0..x3 so as to not clobber callee-saved SMCCC registers
- * and leave x4 for the caller.
+ * Only uses x0..x2 so as to not clobber callee-saved SMCCC registers
+ * and leave x3 for the caller.
  *
  * x0: struct kvm_nvhe_init_params PA
  */
@@ -105,9 +107,9 @@ alternative_else_nop_endif
        /*
         * Set the PS bits in TCR_EL2.
         */
-       ldr     x1, [x0, #NVHE_INIT_TCR_EL2]
-       tcr_compute_pa_size x1, #TCR_EL2_PS_SHIFT, x2, x3
-       msr     tcr_el2, x1
+       ldr     x0, [x0, #NVHE_INIT_TCR_EL2]
+       tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2
+       msr     tcr_el2, x0
 
        isb
 
@@ -185,7 +187,7 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
 
        /* Enable MMU, set vectors and stack. */
        mov     x0, x28
-       bl      ___kvm_hyp_init                 // Clobbers x0..x3
+       bl      ___kvm_hyp_init                 // Clobbers x0..x2
 
        /* Leave idmap. */
        mov     x0, x29
index 24f3d0f..3a5612e 100644 (file)
@@ -83,7 +83,6 @@
 
 
 #define KVM_MAX_VCPUS          16
-#define KVM_USER_MEM_SLOTS     16
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS  0
 
index 8a88eb2..6ce2117 100644 (file)
@@ -10,7 +10,6 @@
 #define _ASM_SPINLOCK_H
 
 #include <asm/processor.h>
-#include <asm/qrwlock.h>
 
 #include <asm-generic/qspinlock_types.h>
 
@@ -27,5 +26,6 @@ static inline void queued_spin_unlock(struct qspinlock *lock)
 }
 
 #include <asm/qspinlock.h>
+#include <asm/qrwlock.h>
 
 #endif /* _ASM_SPINLOCK_H */
index c98f514..ed6086d 100644 (file)
@@ -535,9 +535,12 @@ struct h_cpu_char_result {
        u64 behaviour;
 };
 
-/* Register state for entering a nested guest with H_ENTER_NESTED */
+/*
+ * Register state for entering a nested guest with H_ENTER_NESTED.
+ * New member must be added at the end.
+ */
 struct hv_guest_state {
-       u64 version;            /* version of this structure layout */
+       u64 version;            /* version of this structure layout, must be first */
        u32 lpid;
        u32 vcpu_token;
        /* These registers are hypervisor privileged (at least for writing) */
@@ -566,10 +569,26 @@ struct hv_guest_state {
        u64 pidr;
        u64 cfar;
        u64 ppr;
+       /* Version 1 ends here */
+       u64 dawr1;
+       u64 dawrx1;
+       /* Version 2 ends here */
 };
 
 /* Latest version of hv_guest_state structure */
-#define HV_GUEST_STATE_VERSION 1
+#define HV_GUEST_STATE_VERSION 2
+
+static inline int hv_guest_state_size(unsigned int version)
+{
+       switch (version) {
+       case 1:
+               return offsetofend(struct hv_guest_state, ppr);
+       case 2:
+               return offsetofend(struct hv_guest_state, dawrx1);
+       default:
+               return -1;
+       }
+}
 
 /*
  * From the document "H_GetPerformanceCounterInfo Interface" v1.07
index 078f464..b6d31bf 100644 (file)
@@ -74,16 +74,6 @@ struct kvm_split_mode {
        u8              do_nap;
        u8              napped[MAX_SMT_THREADS];
        struct kvmppc_vcore *vc[MAX_SUBCORES];
-       /* Bits for changing lpcr on P9 */
-       unsigned long   lpcr_req;
-       unsigned long   lpidr_req;
-       unsigned long   host_lpcr;
-       u32             do_set;
-       u32             do_restore;
-       union {
-               u32     allphases;
-               u8      phase[4];
-       } lpcr_sync;
 };
 
 /*
@@ -110,7 +100,6 @@ struct kvmppc_host_state {
        u8 hwthread_state;
        u8 host_ipi;
        u8 ptid;                /* thread number within subcore when split */
-       u8 tid;                 /* thread number within whole core */
        u8 fake_suspend;
        struct kvm_vcpu *kvm_vcpu;
        struct kvmppc_vcore *kvm_vcore;
index d67a470..05fb00d 100644 (file)
@@ -28,7 +28,6 @@
 
 #define KVM_MAX_VCPUS          NR_CPUS
 #define KVM_MAX_VCORES         NR_CPUS
-#define KVM_USER_MEM_SLOTS     512
 
 #include <asm/cputhreads.h>
 
@@ -307,6 +306,7 @@ struct kvm_arch {
        u8 svm_enabled;
        bool threads_indep;
        bool nested_enable;
+       bool dawr1_enabled;
        pgd_t *pgtable;
        u64 process_table;
        struct dentry *debugfs_dir;
@@ -584,8 +584,10 @@ struct kvm_vcpu_arch {
        u32 ctrl;
        u32 dabrx;
        ulong dabr;
-       ulong dawr;
-       ulong dawrx;
+       ulong dawr0;
+       ulong dawrx0;
+       ulong dawr1;
+       ulong dawrx1;
        ulong ciabr;
        ulong cfar;
        ulong ppr;
index 0a056c6..df4bda8 100644 (file)
@@ -314,6 +314,8 @@ struct kvmppc_ops {
                              int size);
        int (*enable_svm)(struct kvm *kvm);
        int (*svm_off)(struct kvm *kvm);
+       int (*enable_dawr1)(struct kvm *kvm);
+       bool (*hash_v3_possible)(void);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
index c3af3f3..9f18fa0 100644 (file)
@@ -644,6 +644,8 @@ struct kvm_ppc_cpu_char {
 #define KVM_REG_PPC_MMCR3      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc1)
 #define KVM_REG_PPC_SIER2      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc2)
 #define KVM_REG_PPC_SIER3      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc3)
+#define KVM_REG_PPC_DAWR1      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc4)
+#define KVM_REG_PPC_DAWRX1     (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc5)
 
 /* Transactional Memory checkpointed state:
  * This is all GPRs, all VSX regs and a subset of SPRs
index b12d7c0..b690c70 100644 (file)
@@ -526,8 +526,10 @@ int main(void)
        OFFSET(VCPU_CTRL, kvm_vcpu, arch.ctrl);
        OFFSET(VCPU_DABR, kvm_vcpu, arch.dabr);
        OFFSET(VCPU_DABRX, kvm_vcpu, arch.dabrx);
-       OFFSET(VCPU_DAWR, kvm_vcpu, arch.dawr);
-       OFFSET(VCPU_DAWRX, kvm_vcpu, arch.dawrx);
+       OFFSET(VCPU_DAWR0, kvm_vcpu, arch.dawr0);
+       OFFSET(VCPU_DAWRX0, kvm_vcpu, arch.dawrx0);
+       OFFSET(VCPU_DAWR1, kvm_vcpu, arch.dawr1);
+       OFFSET(VCPU_DAWRX1, kvm_vcpu, arch.dawrx1);
        OFFSET(VCPU_CIABR, kvm_vcpu, arch.ciabr);
        OFFSET(VCPU_HFLAGS, kvm_vcpu, arch.hflags);
        OFFSET(VCPU_DEC, kvm_vcpu, arch.dec);
@@ -668,7 +670,6 @@ int main(void)
        HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
        HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
        HSTATE_FIELD(HSTATE_PTID, ptid);
-       HSTATE_FIELD(HSTATE_TID, tid);
        HSTATE_FIELD(HSTATE_FAKE_SUSPEND, fake_suspend);
        HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]);
        HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]);
@@ -698,8 +699,6 @@ int main(void)
        OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar);
        OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap);
        OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped);
-       OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set);
-       OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore);
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #ifdef CONFIG_PPC_BOOK3S_64
index 6f612d2..f09708d 100644 (file)
@@ -134,7 +134,7 @@ static inline bool nesting_enabled(struct kvm *kvm)
 }
 
 /* If set, the threads on each CPU core have to be in the same MMU mode */
-static bool no_mixing_hpt_and_radix;
+static bool no_mixing_hpt_and_radix __read_mostly;
 
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
@@ -782,8 +782,24 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
                        return H_UNSUPPORTED_FLAG_START;
                if (value2 & DABRX_HYP)
                        return H_P4;
-               vcpu->arch.dawr  = value1;
-               vcpu->arch.dawrx = value2;
+               vcpu->arch.dawr0  = value1;
+               vcpu->arch.dawrx0 = value2;
+               return H_SUCCESS;
+       case H_SET_MODE_RESOURCE_SET_DAWR1:
+               if (!kvmppc_power8_compatible(vcpu))
+                       return H_P2;
+               if (!ppc_breakpoint_available())
+                       return H_P2;
+               if (!cpu_has_feature(CPU_FTR_DAWR1))
+                       return H_P2;
+               if (!vcpu->kvm->arch.dawr1_enabled)
+                       return H_FUNCTION;
+               if (mflags)
+                       return H_UNSUPPORTED_FLAG_START;
+               if (value2 & DABRX_HYP)
+                       return H_P4;
+               vcpu->arch.dawr1  = value1;
+               vcpu->arch.dawrx1 = value2;
                return H_SUCCESS;
        case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
                /* KVM does not support mflags=2 (AIL=2) */
@@ -1759,10 +1775,16 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
                *val = get_reg_val(id, vcpu->arch.vcore->vtb);
                break;
        case KVM_REG_PPC_DAWR:
-               *val = get_reg_val(id, vcpu->arch.dawr);
+               *val = get_reg_val(id, vcpu->arch.dawr0);
                break;
        case KVM_REG_PPC_DAWRX:
-               *val = get_reg_val(id, vcpu->arch.dawrx);
+               *val = get_reg_val(id, vcpu->arch.dawrx0);
+               break;
+       case KVM_REG_PPC_DAWR1:
+               *val = get_reg_val(id, vcpu->arch.dawr1);
+               break;
+       case KVM_REG_PPC_DAWRX1:
+               *val = get_reg_val(id, vcpu->arch.dawrx1);
                break;
        case KVM_REG_PPC_CIABR:
                *val = get_reg_val(id, vcpu->arch.ciabr);
@@ -1991,10 +2013,16 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
                vcpu->arch.vcore->vtb = set_reg_val(id, *val);
                break;
        case KVM_REG_PPC_DAWR:
-               vcpu->arch.dawr = set_reg_val(id, *val);
+               vcpu->arch.dawr0 = set_reg_val(id, *val);
                break;
        case KVM_REG_PPC_DAWRX:
-               vcpu->arch.dawrx = set_reg_val(id, *val) & ~DAWRX_HYP;
+               vcpu->arch.dawrx0 = set_reg_val(id, *val) & ~DAWRX_HYP;
+               break;
+       case KVM_REG_PPC_DAWR1:
+               vcpu->arch.dawr1 = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_DAWRX1:
+               vcpu->arch.dawrx1 = set_reg_val(id, *val) & ~DAWRX_HYP;
                break;
        case KVM_REG_PPC_CIABR:
                vcpu->arch.ciabr = set_reg_val(id, *val);
@@ -2862,11 +2890,6 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
        if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
                return false;
 
-       /* Some POWER9 chips require all threads to be in the same MMU mode */
-       if (no_mixing_hpt_and_radix &&
-           kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
-               return false;
-
        if (n_threads < cip->max_subcore_threads)
                n_threads = cip->max_subcore_threads;
        if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
@@ -2905,6 +2928,9 @@ static void prepare_threads(struct kvmppc_vcore *vc)
        for_each_runnable_thread(i, vcpu, vc) {
                if (signal_pending(vcpu->arch.run_task))
                        vcpu->arch.ret = -EINTR;
+               else if (no_mixing_hpt_and_radix &&
+                        kvm_is_radix(vc->kvm) != radix_enabled())
+                       vcpu->arch.ret = -EINVAL;
                else if (vcpu->arch.vpa.update_pending ||
                         vcpu->arch.slb_shadow.update_pending ||
                         vcpu->arch.dtl.update_pending)
@@ -3110,7 +3136,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        int controlled_threads;
        int trap;
        bool is_power8;
-       bool hpt_on_radix;
 
        /*
         * Remove from the list any threads that have a signal pending
@@ -3143,11 +3168,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
         * this is a HPT guest on a radix host machine where the
         * CPU threads may not be in different MMU modes.
         */
-       hpt_on_radix = no_mixing_hpt_and_radix && radix_enabled() &&
-               !kvm_is_radix(vc->kvm);
-       if (((controlled_threads > 1) &&
-            ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) ||
-           (hpt_on_radix && vc->kvm->arch.threads_indep)) {
+       if ((controlled_threads > 1) &&
+           ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
                for_each_runnable_thread(i, vcpu, vc) {
                        vcpu->arch.ret = -EBUSY;
                        kvmppc_remove_runnable(vc, vcpu);
@@ -3215,7 +3237,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
                && !cpu_has_feature(CPU_FTR_ARCH_300);
 
-       if (split > 1 || hpt_on_radix) {
+       if (split > 1) {
                sip = &split_info;
                memset(&split_info, 0, sizeof(split_info));
                for (sub = 0; sub < core_info.n_subcores; ++sub)
@@ -3237,13 +3259,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                        split_info.subcore_size = subcore_size;
                } else {
                        split_info.subcore_size = 1;
-                       if (hpt_on_radix) {
-                               /* Use the split_info for LPCR/LPIDR changes */
-                               split_info.lpcr_req = vc->lpcr;
-                               split_info.lpidr_req = vc->kvm->arch.lpid;
-                               split_info.host_lpcr = vc->kvm->arch.host_lpcr;
-                               split_info.do_set = 1;
-                       }
                }
 
                /* order writes to split_info before kvm_split_mode pointer */
@@ -3253,7 +3268,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        for (thr = 0; thr < controlled_threads; ++thr) {
                struct paca_struct *paca = paca_ptrs[pcpu + thr];
 
-               paca->kvm_hstate.tid = thr;
                paca->kvm_hstate.napping = 0;
                paca->kvm_hstate.kvm_split_mode = sip;
        }
@@ -3327,10 +3341,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
         * When doing micro-threading, poke the inactive threads as well.
         * This gets them to the nap instruction after kvm_do_nap,
         * which reduces the time taken to unsplit later.
-        * For POWER9 HPT guest on radix host, we need all the secondary
-        * threads woken up so they can do the LPCR/LPIDR change.
         */
-       if (cmd_bit || hpt_on_radix) {
+       if (cmd_bit) {
                split_info.do_nap = 1;  /* ask secondaries to nap when done */
                for (thr = 1; thr < threads_per_subcore; ++thr)
                        if (!(active & (1 << thr)))
@@ -3391,19 +3403,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                        cpu_relax();
                        ++loops;
                }
-       } else if (hpt_on_radix) {
-               /* Wait for all threads to have seen final sync */
-               for (thr = 1; thr < controlled_threads; ++thr) {
-                       struct paca_struct *paca = paca_ptrs[pcpu + thr];
-
-                       while (paca->kvm_hstate.kvm_split_mode) {
-                               HMT_low();
-                               barrier();
-                       }
-                       HMT_medium();
-               }
+               split_info.do_nap = 0;
        }
-       split_info.do_nap = 0;
 
        kvmppc_set_host_core(pcpu);
 
@@ -3449,10 +3450,17 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
        int trap;
        unsigned long host_hfscr = mfspr(SPRN_HFSCR);
        unsigned long host_ciabr = mfspr(SPRN_CIABR);
-       unsigned long host_dawr = mfspr(SPRN_DAWR0);
-       unsigned long host_dawrx = mfspr(SPRN_DAWRX0);
+       unsigned long host_dawr0 = mfspr(SPRN_DAWR0);
+       unsigned long host_dawrx0 = mfspr(SPRN_DAWRX0);
        unsigned long host_psscr = mfspr(SPRN_PSSCR);
        unsigned long host_pidr = mfspr(SPRN_PID);
+       unsigned long host_dawr1 = 0;
+       unsigned long host_dawrx1 = 0;
+
+       if (cpu_has_feature(CPU_FTR_DAWR1)) {
+               host_dawr1 = mfspr(SPRN_DAWR1);
+               host_dawrx1 = mfspr(SPRN_DAWRX1);
+       }
 
        /*
         * P8 and P9 suppress the HDEC exception when LPCR[HDICE] = 0,
@@ -3489,8 +3497,12 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
        mtspr(SPRN_SPURR, vcpu->arch.spurr);
 
        if (dawr_enabled()) {
-               mtspr(SPRN_DAWR0, vcpu->arch.dawr);
-               mtspr(SPRN_DAWRX0, vcpu->arch.dawrx);
+               mtspr(SPRN_DAWR0, vcpu->arch.dawr0);
+               mtspr(SPRN_DAWRX0, vcpu->arch.dawrx0);
+               if (cpu_has_feature(CPU_FTR_DAWR1)) {
+                       mtspr(SPRN_DAWR1, vcpu->arch.dawr1);
+                       mtspr(SPRN_DAWRX1, vcpu->arch.dawrx1);
+               }
        }
        mtspr(SPRN_CIABR, vcpu->arch.ciabr);
        mtspr(SPRN_IC, vcpu->arch.ic);
@@ -3542,8 +3554,12 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
              (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
        mtspr(SPRN_HFSCR, host_hfscr);
        mtspr(SPRN_CIABR, host_ciabr);
-       mtspr(SPRN_DAWR0, host_dawr);
-       mtspr(SPRN_DAWRX0, host_dawrx);
+       mtspr(SPRN_DAWR0, host_dawr0);
+       mtspr(SPRN_DAWRX0, host_dawrx0);
+       if (cpu_has_feature(CPU_FTR_DAWR1)) {
+               mtspr(SPRN_DAWR1, host_dawr1);
+               mtspr(SPRN_DAWRX1, host_dawrx1);
+       }
        mtspr(SPRN_PID, host_pidr);
 
        /*
@@ -3595,6 +3611,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
        unsigned long host_tidr = mfspr(SPRN_TIDR);
        unsigned long host_iamr = mfspr(SPRN_IAMR);
        unsigned long host_amr = mfspr(SPRN_AMR);
+       unsigned long host_fscr = mfspr(SPRN_FSCR);
        s64 dec;
        u64 tb;
        int trap, save_pmu;
@@ -3735,6 +3752,9 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
        if (host_amr != vcpu->arch.amr)
                mtspr(SPRN_AMR, host_amr);
 
+       if (host_fscr != vcpu->arch.fscr)
+               mtspr(SPRN_FSCR, host_fscr);
+
        msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
        store_fp_state(&vcpu->arch.fp);
 #ifdef CONFIG_ALTIVEC
@@ -4173,7 +4193,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
 
        kvmppc_clear_host_core(pcpu);
 
-       local_paca->kvm_hstate.tid = 0;
        local_paca->kvm_hstate.napping = 0;
        local_paca->kvm_hstate.kvm_split_mode = NULL;
        kvmppc_start_thread(vcpu, vc);
@@ -4358,15 +4377,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
 
        do {
                /*
-                * The early POWER9 chips that can't mix radix and HPT threads
-                * on the same core also need the workaround for the problem
-                * where the TLB would prefetch entries in the guest exit path
-                * for radix guests using the guest PIDR value and LPID 0.
-                * The workaround is in the old path (kvmppc_run_vcpu())
-                * but not the new path (kvmhv_run_single_vcpu()).
+                * The TLB prefetch bug fixup is only in the kvmppc_run_vcpu
+                * path, which also handles hash and dependent threads mode.
                 */
                if (kvm->arch.threads_indep && kvm_is_radix(kvm) &&
-                   !no_mixing_hpt_and_radix)
+                   !cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
                        r = kvmhv_run_single_vcpu(vcpu, ~(u64)0,
                                                  vcpu->arch.vcore->lpcr);
                else
@@ -5599,6 +5614,26 @@ out:
        return ret;
 }
 
+static int kvmhv_enable_dawr1(struct kvm *kvm)
+{
+       if (!cpu_has_feature(CPU_FTR_DAWR1))
+               return -ENODEV;
+
+       /* kvm == NULL means the caller is testing if the capability exists */
+       if (kvm)
+               kvm->arch.dawr1_enabled = true;
+       return 0;
+}
+
+static bool kvmppc_hash_v3_possible(void)
+{
+       if (radix_enabled() && no_mixing_hpt_and_radix)
+               return false;
+
+       return cpu_has_feature(CPU_FTR_ARCH_300) &&
+               cpu_has_feature(CPU_FTR_HVMODE);
+}
+
 static struct kvmppc_ops kvm_ops_hv = {
        .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
        .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -5642,6 +5677,8 @@ static struct kvmppc_ops kvm_ops_hv = {
        .store_to_eaddr = kvmhv_store_to_eaddr,
        .enable_svm = kvmhv_enable_svm,
        .svm_off = kvmhv_svm_off,
+       .enable_dawr1 = kvmhv_enable_dawr1,
+       .hash_v3_possible = kvmppc_hash_v3_possible,
 };
 
 static int kvm_init_subcore_bitmap(void)
index 8053efd..f3d3183 100644 (file)
@@ -277,8 +277,7 @@ void kvmhv_commence_exit(int trap)
        struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
        int ptid = local_paca->kvm_hstate.ptid;
        struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
-       int me, ee, i, t;
-       int cpu0;
+       int me, ee, i;
 
        /* Set our bit in the threads-exiting-guest map in the 0xff00
           bits of vcore->entry_exit_map */
@@ -320,22 +319,6 @@ void kvmhv_commence_exit(int trap)
                if ((ee >> 8) == 0)
                        kvmhv_interrupt_vcore(vc, ee);
        }
-
-       /*
-        * On POWER9 when running a HPT guest on a radix host (sip != NULL),
-        * we have to interrupt inactive CPU threads to get them to
-        * restore the host LPCR value.
-        */
-       if (sip->lpcr_req) {
-               if (cmpxchg(&sip->do_restore, 0, 1) == 0) {
-                       vc = local_paca->kvm_hstate.kvm_vcore;
-                       cpu0 = vc->pcpu + ptid - local_paca->kvm_hstate.tid;
-                       for (t = 1; t < threads_per_core; ++t) {
-                               if (sip->napped[t])
-                                       kvmhv_rm_send_ipi(cpu0 + t);
-                       }
-               }
-       }
 }
 
 struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
@@ -667,95 +650,6 @@ void kvmppc_bad_interrupt(struct pt_regs *regs)
        panic("Bad KVM trap");
 }
 
-/*
- * Functions used to switch LPCR HR and UPRT bits on all threads
- * when entering and exiting HPT guests on a radix host.
- */
-
-#define PHASE_REALMODE         1       /* in real mode */
-#define PHASE_SET_LPCR         2       /* have set LPCR */
-#define PHASE_OUT_OF_GUEST     4       /* have finished executing in guest */
-#define PHASE_RESET_LPCR       8       /* have reset LPCR to host value */
-
-#define ALL(p)         (((p) << 24) | ((p) << 16) | ((p) << 8) | (p))
-
-static void wait_for_sync(struct kvm_split_mode *sip, int phase)
-{
-       int thr = local_paca->kvm_hstate.tid;
-
-       sip->lpcr_sync.phase[thr] |= phase;
-       phase = ALL(phase);
-       while ((sip->lpcr_sync.allphases & phase) != phase) {
-               HMT_low();
-               barrier();
-       }
-       HMT_medium();
-}
-
-void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip)
-{
-       int num_sets;
-       unsigned long rb, set;
-
-       /* wait for every other thread to get to real mode */
-       wait_for_sync(sip, PHASE_REALMODE);
-
-       /* Set LPCR and LPIDR */
-       mtspr(SPRN_LPCR, sip->lpcr_req);
-       mtspr(SPRN_LPID, sip->lpidr_req);
-       isync();
-
-       /*
-        * P10 will flush all the congruence class with a single tlbiel
-        */
-       if (cpu_has_feature(CPU_FTR_ARCH_31))
-               num_sets =  1;
-       else
-               num_sets = POWER9_TLB_SETS_RADIX;
-
-       /* Invalidate the TLB on thread 0 */
-       if (local_paca->kvm_hstate.tid == 0) {
-               sip->do_set = 0;
-               asm volatile("ptesync" : : : "memory");
-               for (set = 0; set < num_sets; ++set) {
-                       rb = TLBIEL_INVAL_SET_LPID +
-                               (set << TLBIEL_INVAL_SET_SHIFT);
-                       asm volatile(PPC_TLBIEL(%0, %1, 0, 0, 0) : :
-                                    "r" (rb), "r" (0));
-               }
-               asm volatile("ptesync" : : : "memory");
-       }
-
-       /* indicate that we have done so and wait for others */
-       wait_for_sync(sip, PHASE_SET_LPCR);
-       /* order read of sip->lpcr_sync.allphases vs. sip->do_set */
-       smp_rmb();
-}
-
-/*
- * Called when a thread that has been in the guest needs
- * to reload the host LPCR value - but only on POWER9 when
- * running a HPT guest on a radix host.
- */
-void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
-{
-       /* we're out of the guest... */
-       wait_for_sync(sip, PHASE_OUT_OF_GUEST);
-
-       mtspr(SPRN_LPID, 0);
-       mtspr(SPRN_LPCR, sip->host_lpcr);
-       isync();
-
-       if (local_paca->kvm_hstate.tid == 0) {
-               sip->do_restore = 0;
-               smp_wmb();      /* order store of do_restore vs. phase */
-       }
-
-       wait_for_sync(sip, PHASE_RESET_LPCR);
-       smp_mb();
-       local_paca->kvm_hstate.kvm_split_mode = NULL;
-}
-
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
 {
        vcpu->arch.ceded = 0;
index 33b5854..0cd0e7a 100644 (file)
@@ -33,8 +33,8 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
        hr->dpdes = vc->dpdes;
        hr->hfscr = vcpu->arch.hfscr;
        hr->tb_offset = vc->tb_offset;
-       hr->dawr0 = vcpu->arch.dawr;
-       hr->dawrx0 = vcpu->arch.dawrx;
+       hr->dawr0 = vcpu->arch.dawr0;
+       hr->dawrx0 = vcpu->arch.dawrx0;
        hr->ciabr = vcpu->arch.ciabr;
        hr->purr = vcpu->arch.purr;
        hr->spurr = vcpu->arch.spurr;
@@ -49,6 +49,8 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
        hr->pidr = vcpu->arch.pid;
        hr->cfar = vcpu->arch.cfar;
        hr->ppr = vcpu->arch.ppr;
+       hr->dawr1 = vcpu->arch.dawr1;
+       hr->dawrx1 = vcpu->arch.dawrx1;
 }
 
 static void byteswap_pt_regs(struct pt_regs *regs)
@@ -91,6 +93,8 @@ static void byteswap_hv_regs(struct hv_guest_state *hr)
        hr->pidr = swab64(hr->pidr);
        hr->cfar = swab64(hr->cfar);
        hr->ppr = swab64(hr->ppr);
+       hr->dawr1 = swab64(hr->dawr1);
+       hr->dawrx1 = swab64(hr->dawrx1);
 }
 
 static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
@@ -138,6 +142,7 @@ static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
 
        /* Don't let data address watchpoint match in hypervisor state */
        hr->dawrx0 &= ~DAWRX_HYP;
+       hr->dawrx1 &= ~DAWRX_HYP;
 
        /* Don't let completed instruction address breakpt match in HV state */
        if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
@@ -151,8 +156,8 @@ static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
        vc->pcr = hr->pcr | PCR_MASK;
        vc->dpdes = hr->dpdes;
        vcpu->arch.hfscr = hr->hfscr;
-       vcpu->arch.dawr = hr->dawr0;
-       vcpu->arch.dawrx = hr->dawrx0;
+       vcpu->arch.dawr0 = hr->dawr0;
+       vcpu->arch.dawrx0 = hr->dawrx0;
        vcpu->arch.ciabr = hr->ciabr;
        vcpu->arch.purr = hr->purr;
        vcpu->arch.spurr = hr->spurr;
@@ -167,6 +172,8 @@ static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
        vcpu->arch.pid = hr->pidr;
        vcpu->arch.cfar = hr->cfar;
        vcpu->arch.ppr = hr->ppr;
+       vcpu->arch.dawr1 = hr->dawr1;
+       vcpu->arch.dawrx1 = hr->dawrx1;
 }
 
 void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
@@ -215,12 +222,51 @@ static void kvmhv_nested_mmio_needed(struct kvm_vcpu *vcpu, u64 regs_ptr)
        }
 }
 
+static int kvmhv_read_guest_state_and_regs(struct kvm_vcpu *vcpu,
+                                          struct hv_guest_state *l2_hv,
+                                          struct pt_regs *l2_regs,
+                                          u64 hv_ptr, u64 regs_ptr)
+{
+       int size;
+
+       if (kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv->version,
+                               sizeof(l2_hv->version)))
+               return -1;
+
+       if (kvmppc_need_byteswap(vcpu))
+               l2_hv->version = swab64(l2_hv->version);
+
+       size = hv_guest_state_size(l2_hv->version);
+       if (size < 0)
+               return -1;
+
+       return kvm_vcpu_read_guest(vcpu, hv_ptr, l2_hv, size) ||
+               kvm_vcpu_read_guest(vcpu, regs_ptr, l2_regs,
+                                   sizeof(struct pt_regs));
+}
+
+static int kvmhv_write_guest_state_and_regs(struct kvm_vcpu *vcpu,
+                                           struct hv_guest_state *l2_hv,
+                                           struct pt_regs *l2_regs,
+                                           u64 hv_ptr, u64 regs_ptr)
+{
+       int size;
+
+       size = hv_guest_state_size(l2_hv->version);
+       if (size < 0)
+               return -1;
+
+       return kvm_vcpu_write_guest(vcpu, hv_ptr, l2_hv, size) ||
+               kvm_vcpu_write_guest(vcpu, regs_ptr, l2_regs,
+                                    sizeof(struct pt_regs));
+}
+
 long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
 {
        long int err, r;
        struct kvm_nested_guest *l2;
        struct pt_regs l2_regs, saved_l1_regs;
-       struct hv_guest_state l2_hv, saved_l1_hv;
+       struct hv_guest_state l2_hv = {0}, saved_l1_hv;
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
        u64 hv_ptr, regs_ptr;
        u64 hdec_exp;
@@ -235,17 +281,15 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
        hv_ptr = kvmppc_get_gpr(vcpu, 4);
        regs_ptr = kvmppc_get_gpr(vcpu, 5);
        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-       err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv,
-                                 sizeof(struct hv_guest_state)) ||
-               kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs,
-                                   sizeof(struct pt_regs));
+       err = kvmhv_read_guest_state_and_regs(vcpu, &l2_hv, &l2_regs,
+                                             hv_ptr, regs_ptr);
        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
        if (err)
                return H_PARAMETER;
 
        if (kvmppc_need_byteswap(vcpu))
                byteswap_hv_regs(&l2_hv);
-       if (l2_hv.version != HV_GUEST_STATE_VERSION)
+       if (l2_hv.version > HV_GUEST_STATE_VERSION)
                return H_P2;
 
        if (kvmppc_need_byteswap(vcpu))
@@ -325,10 +369,8 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
                byteswap_pt_regs(&l2_regs);
        }
        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-       err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv,
-                                  sizeof(struct hv_guest_state)) ||
-               kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs,
-                                  sizeof(struct pt_regs));
+       err = kvmhv_write_guest_state_and_regs(vcpu, &l2_hv, &l2_regs,
+                                              hv_ptr, regs_ptr);
        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
        if (err)
                return H_AUTHORITY;
index cd9995e..5e634db 100644 (file)
@@ -52,11 +52,13 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 #define STACK_SLOT_PID         (SFS-32)
 #define STACK_SLOT_IAMR                (SFS-40)
 #define STACK_SLOT_CIABR       (SFS-48)
-#define STACK_SLOT_DAWR                (SFS-56)
-#define STACK_SLOT_DAWRX       (SFS-64)
+#define STACK_SLOT_DAWR0       (SFS-56)
+#define STACK_SLOT_DAWRX0      (SFS-64)
 #define STACK_SLOT_HFSCR       (SFS-72)
 #define STACK_SLOT_AMR         (SFS-80)
 #define STACK_SLOT_UAMOR       (SFS-88)
+#define STACK_SLOT_DAWR1       (SFS-96)
+#define STACK_SLOT_DAWRX1      (SFS-104)
 /* the following is used by the P9 short path */
 #define STACK_SLOT_NVGPRS      (SFS-152)       /* 18 gprs */
 
@@ -85,19 +87,6 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
        RFI_TO_KERNEL
 
 kvmppc_call_hv_entry:
-BEGIN_FTR_SECTION
-       /* On P9, do LPCR setting, if necessary */
-       ld      r3, HSTATE_SPLIT_MODE(r13)
-       cmpdi   r3, 0
-       beq     46f
-       lwz     r4, KVM_SPLIT_DO_SET(r3)
-       cmpwi   r4, 0
-       beq     46f
-       bl      kvmhv_p9_set_lpcr
-       nop
-46:
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-
        ld      r4, HSTATE_KVM_VCPU(r13)
        bl      kvmppc_hv_entry
 
@@ -361,11 +350,11 @@ kvm_secondary_got_guest:
        LOAD_REG_ADDR(r6, decrementer_max)
        ld      r6, 0(r6)
        mtspr   SPRN_HDEC, r6
+BEGIN_FTR_SECTION
        /* and set per-LPAR registers, if doing dynamic micro-threading */
        ld      r6, HSTATE_SPLIT_MODE(r13)
        cmpdi   r6, 0
        beq     63f
-BEGIN_FTR_SECTION
        ld      r0, KVM_SPLIT_RPR(r6)
        mtspr   SPRN_RPR, r0
        ld      r0, KVM_SPLIT_PMMAR(r6)
@@ -373,16 +362,7 @@ BEGIN_FTR_SECTION
        ld      r0, KVM_SPLIT_LDBAR(r6)
        mtspr   SPRN_LDBAR, r0
        isync
-FTR_SECTION_ELSE
-       /* On P9 we use the split_info for coordinating LPCR changes */
-       lwz     r4, KVM_SPLIT_DO_SET(r6)
-       cmpwi   r4, 0
-       beq     1f
-       mr      r3, r6
-       bl      kvmhv_p9_set_lpcr
-       nop
-1:
-ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 63:
        /* Order load of vcpu after load of vcore */
        lwsync
@@ -452,19 +432,15 @@ kvm_no_guest:
        mtcr    r5
        blr
 
-53:    HMT_LOW
+53:
+BEGIN_FTR_SECTION
+       HMT_LOW
        ld      r5, HSTATE_KVM_VCORE(r13)
        cmpdi   r5, 0
        bne     60f
        ld      r3, HSTATE_SPLIT_MODE(r13)
        cmpdi   r3, 0
        beq     kvm_no_guest
-       lwz     r0, KVM_SPLIT_DO_SET(r3)
-       cmpwi   r0, 0
-       bne     kvmhv_do_set
-       lwz     r0, KVM_SPLIT_DO_RESTORE(r3)
-       cmpwi   r0, 0
-       bne     kvmhv_do_restore
        lbz     r0, KVM_SPLIT_DO_NAP(r3)
        cmpwi   r0, 0
        beq     kvm_no_guest
@@ -472,24 +448,19 @@ kvm_no_guest:
        b       kvm_unsplit_nap
 60:    HMT_MEDIUM
        b       kvm_secondary_got_guest
+FTR_SECTION_ELSE
+       HMT_LOW
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       cmpdi   r5, 0
+       beq     kvm_no_guest
+       HMT_MEDIUM
+       b       kvm_secondary_got_guest
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 
 54:    li      r0, KVM_HWTHREAD_IN_KVM
        stb     r0, HSTATE_HWTHREAD_STATE(r13)
        b       kvm_no_guest
 
-kvmhv_do_set:
-       /* Set LPCR, LPIDR etc. on P9 */
-       HMT_MEDIUM
-       bl      kvmhv_p9_set_lpcr
-       nop
-       b       kvm_no_guest
-
-kvmhv_do_restore:
-       HMT_MEDIUM
-       bl      kvmhv_p9_restore_lpcr
-       nop
-       b       kvm_no_guest
-
 /*
  * Here the primary thread is trying to return the core to
  * whole-core mode, so we need to nap.
@@ -527,7 +498,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        /* Set kvm_split_mode.napped[tid] = 1 */
        ld      r3, HSTATE_SPLIT_MODE(r13)
        li      r0, 1
-       lbz     r4, HSTATE_TID(r13)
+       lhz     r4, PACAPACAINDEX(r13)
+       clrldi  r4, r4, 61      /* micro-threading => P8 => 8 threads/core */
        addi    r4, r4, KVM_SPLIT_NAPPED
        stbx    r0, r3, r4
        /* Check the do_nap flag again after setting napped[] */
@@ -711,10 +683,16 @@ BEGIN_FTR_SECTION
        mfspr   r7, SPRN_DAWRX0
        mfspr   r8, SPRN_IAMR
        std     r5, STACK_SLOT_CIABR(r1)
-       std     r6, STACK_SLOT_DAWR(r1)
-       std     r7, STACK_SLOT_DAWRX(r1)
+       std     r6, STACK_SLOT_DAWR0(r1)
+       std     r7, STACK_SLOT_DAWRX0(r1)
        std     r8, STACK_SLOT_IAMR(r1)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+BEGIN_FTR_SECTION
+       mfspr   r6, SPRN_DAWR1
+       mfspr   r7, SPRN_DAWRX1
+       std     r6, STACK_SLOT_DAWR1(r1)
+       std     r7, STACK_SLOT_DAWRX1(r1)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S | CPU_FTR_DAWR1)
 
        mfspr   r5, SPRN_AMR
        std     r5, STACK_SLOT_AMR(r1)
@@ -801,10 +779,16 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        lbz     r5, 0(r5)
        cmpdi   r5, 0
        beq     1f
-       ld      r5, VCPU_DAWR(r4)
-       ld      r6, VCPU_DAWRX(r4)
+       ld      r5, VCPU_DAWR0(r4)
+       ld      r6, VCPU_DAWRX0(r4)
        mtspr   SPRN_DAWR0, r5
        mtspr   SPRN_DAWRX0, r6
+BEGIN_FTR_SECTION
+       ld      r5, VCPU_DAWR1(r4)
+       ld      r6, VCPU_DAWRX1(r4)
+       mtspr   SPRN_DAWR1, r5
+       mtspr   SPRN_DAWRX1, r6
+END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
 1:
        ld      r7, VCPU_CIABR(r4)
        ld      r8, VCPU_TAR(r4)
@@ -918,15 +902,19 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
        cmpdi   r3, 512         /* 1 microsecond */
        blt     hdec_soon
 
-       /* For hash guest, clear out and reload the SLB */
        ld      r6, VCPU_KVM(r4)
        lbz     r0, KVM_RADIX(r6)
        cmpwi   r0, 0
        bne     9f
+
+       /* For hash guest, clear out and reload the SLB */
+BEGIN_MMU_FTR_SECTION
+       /* Radix host won't have populated the SLB, so no need to clear */
        li      r6, 0
        slbmte  r6, r6
-       slbia
+       PPC_SLBIA(6)
        ptesync
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
 
        /* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */
        lwz     r5,VCPU_SLB_MAX(r4)
@@ -1187,6 +1175,20 @@ EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9)
        mr      r4, r3
        b       fast_guest_entry_c
 guest_exit_short_path:
+       /*
+        * Malicious or buggy radix guests may have inserted SLB entries
+        * (only 0..3 because radix always runs with UPRT=1), so these must
+        * be cleared here to avoid side-channels. slbmte is used rather
+        * than slbia, as it won't clear cached translations.
+        */
+       li      r0,0
+       slbmte  r0,r0
+       li      r4,1
+       slbmte  r0,r4
+       li      r4,2
+       slbmte  r0,r4
+       li      r4,3
+       slbmte  r0,r4
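The unrolled sequence above is what a straightforward loop over SLB indices 0..3 would produce; a C-level sketch, assuming GCC inline assembly is acceptable at this point (illustrative only):

/* Sketch: invalidate SLB entries 0..3 by writing them back with V = 0. */
unsigned long i;

for (i = 0; i < 4; i++)
        asm volatile("slbmte %0,%1" : : "r" (0UL), "r" (i) : "memory");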
 
        li      r0, KVM_GUEST_MODE_NONE
        stb     r0, HSTATE_IN_GUEST(r13)
@@ -1499,7 +1501,7 @@ guest_exit_cont:          /* r9 = vcpu, r12 = trap, r13 = paca */
        lbz     r0, KVM_RADIX(r5)
        li      r5, 0
        cmpwi   r0, 0
-       bne     3f                      /* for radix, save 0 entries */
+       bne     0f                      /* for radix, save 0 entries */
        lwz     r0,VCPU_SLB_NR(r9)      /* number of entries in SLB */
        mtctr   r0
        li      r6,0
@@ -1518,13 +1520,13 @@ guest_exit_cont:                /* r9 = vcpu, r12 = trap, r13 = paca */
        /* Finally clear out the SLB */
        li      r0,0
        slbmte  r0,r0
-       slbia
+       PPC_SLBIA(6)
        ptesync
-3:     stw     r5,VCPU_SLB_MAX(r9)
+       stw     r5,VCPU_SLB_MAX(r9)
 
        /* load host SLB entries */
 BEGIN_MMU_FTR_SECTION
-       b       0f
+       b       guest_bypass
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
        ld      r8,PACA_SLBSHADOWPTR(r13)
 
@@ -1538,7 +1540,21 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
        slbmte  r6,r5
 1:     addi    r8,r8,16
        .endr
-0:
+       b       guest_bypass
+
+0:     /*
+        * Sanitise radix guest SLB, see guest_exit_short_path comment.
+        * We clear vcpu->arch.slb_max to match earlier behaviour.
+        */
+       li      r0,0
+       stw     r0,VCPU_SLB_MAX(r9)
+       slbmte  r0,r0
+       li      r4,1
+       slbmte  r0,r4
+       li      r4,2
+       slbmte  r0,r4
+       li      r4,3
+       slbmte  r0,r4
 
 guest_bypass:
        stw     r12, STACK_SLOT_TRAP(r1)
@@ -1759,8 +1775,8 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
        /* Restore host values of some registers */
 BEGIN_FTR_SECTION
        ld      r5, STACK_SLOT_CIABR(r1)
-       ld      r6, STACK_SLOT_DAWR(r1)
-       ld      r7, STACK_SLOT_DAWRX(r1)
+       ld      r6, STACK_SLOT_DAWR0(r1)
+       ld      r7, STACK_SLOT_DAWRX0(r1)
        mtspr   SPRN_CIABR, r5
        /*
         * If the DAWR doesn't work, it's ok to write these here as
@@ -1769,6 +1785,12 @@ BEGIN_FTR_SECTION
        mtspr   SPRN_DAWR0, r6
        mtspr   SPRN_DAWRX0, r7
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+BEGIN_FTR_SECTION
+       ld      r6, STACK_SLOT_DAWR1(r1)
+       ld      r7, STACK_SLOT_DAWRX1(r1)
+       mtspr   SPRN_DAWR1, r6
+       mtspr   SPRN_DAWRX1, r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S | CPU_FTR_DAWR1)
 BEGIN_FTR_SECTION
        ld      r5, STACK_SLOT_TID(r1)
        ld      r6, STACK_SLOT_PSSCR(r1)
@@ -1938,24 +1960,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 19:    lis     r8,0x7fff               /* MAX_INT@h */
        mtspr   SPRN_HDEC,r8
 
-16:
-BEGIN_FTR_SECTION
-       /* On POWER9 with HPT-on-radix we need to wait for all other threads */
-       ld      r3, HSTATE_SPLIT_MODE(r13)
-       cmpdi   r3, 0
-       beq     47f
-       lwz     r8, KVM_SPLIT_DO_RESTORE(r3)
-       cmpwi   r8, 0
-       beq     47f
-       bl      kvmhv_p9_restore_lpcr
-       nop
-       b       48f
-47:
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-       ld      r8,KVM_HOST_LPCR(r4)
+16:    ld      r8,KVM_HOST_LPCR(r4)
        mtspr   SPRN_LPCR,r8
        isync
-48:
+
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
        /* Finish timing, if we have a vcpu */
        ld      r4, HSTATE_KVM_VCPU(r13)
@@ -2574,8 +2582,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        rlwimi  r5, r4, 5, DAWRX_DR | DAWRX_DW
        rlwimi  r5, r4, 2, DAWRX_WT
        clrrdi  r4, r4, 3
-       std     r4, VCPU_DAWR(r3)
-       std     r5, VCPU_DAWRX(r3)
+       std     r4, VCPU_DAWR0(r3)
+       std     r5, VCPU_DAWRX0(r3)
        /*
         * If came in through the real mode hcall handler then it is necessary
         * to write the registers since the return path won't. Otherwise it is
@@ -2779,8 +2787,10 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
        beq     kvm_end_cede
        cmpwi   r0, NAPPING_NOVCPU
        beq     kvm_novcpu_wakeup
+BEGIN_FTR_SECTION
        cmpwi   r0, NAPPING_UNSPLIT
        beq     kvm_unsplit_wakeup
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
        twi     31,0,0 /* Nap state must not be zero */
 
 33:    mr      r4, r3
@@ -3343,13 +3353,18 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
        mtspr   SPRN_IAMR, r0
        mtspr   SPRN_CIABR, r0
        mtspr   SPRN_DAWRX0, r0
+BEGIN_FTR_SECTION
+       mtspr   SPRN_DAWRX1, r0
+END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
+
+       /* Clear hash and radix guest SLB, see guest_exit_short_path comment. */
+       slbmte  r0, r0
+       PPC_SLBIA(6)
 
 BEGIN_MMU_FTR_SECTION
        b       4f
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
 
-       slbmte  r0, r0
-       slbia
        ptesync
        ld      r8, PACA_SLBSHADOWPTR(r13)
        .rept   SLB_NUM_BOLTED
index 288a982..f38ae3e 100644 (file)
@@ -698,7 +698,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 
                kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
                r = 1;
-       };
+       }
 
        return r;
 }
index cf52d26..6c083a9 100644 (file)
@@ -611,8 +611,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = !!(hv_enabled && radix_enabled());
                break;
        case KVM_CAP_PPC_MMU_HASH_V3:
-               r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) &&
-                      cpu_has_feature(CPU_FTR_HVMODE));
+               r = !!(hv_enabled && kvmppc_hv_ops->hash_v3_possible &&
+                      kvmppc_hv_ops->hash_v3_possible());
                break;
        case KVM_CAP_PPC_NESTED_HV:
                r = !!(hv_enabled && kvmppc_hv_ops->enable_nested &&
@@ -678,6 +678,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = hv_enabled && kvmppc_hv_ops->enable_svm &&
                        !kvmppc_hv_ops->enable_svm(NULL);
                break;
+       case KVM_CAP_PPC_DAWR1:
+               r = !!(hv_enabled && kvmppc_hv_ops->enable_dawr1 &&
+                      !kvmppc_hv_ops->enable_dawr1(NULL));
+               break;
 #endif
        default:
                r = 0;
@@ -2187,6 +2191,12 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
                        break;
                r = kvm->arch.kvm_ops->enable_svm(kvm);
                break;
+       case KVM_CAP_PPC_DAWR1:
+               r = -EINVAL;
+               if (!is_kvmppc_hv_enabled(kvm) || !kvm->arch.kvm_ops->enable_dawr1)
+                       break;
+               r = kvm->arch.kvm_ops->enable_dawr1(kvm);
+               break;
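The new capability is enabled per-VM through KVM_ENABLE_CAP; a minimal userspace sketch (assuming a VM fd obtained from KVM_CREATE_VM, not code from this series):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: expose the second DAWR/DAWRX pair to a VM. */
static int enable_dawr1(int vm_fd)
{
        struct kvm_enable_cap cap = { .cap = KVM_CAP_PPC_DAWR1 };

        /* Fails with EINVAL unless HV KVM provides an enable_dawr1() hook. */
        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}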
 #endif
        default:
                r = -EINVAL;
index 74f9a03..6bcfc56 100644 (file)
@@ -28,7 +28,6 @@
 #define KVM_S390_BSCA_CPU_SLOTS 64
 #define KVM_S390_ESCA_CPU_SLOTS 248
 #define KVM_MAX_VCPUS 255
-#define KVM_USER_MEM_SLOTS 32
 
 /*
  * These seem to be used for allocating ->chip in the routing table, which we
index 18f2d10..474617b 100644 (file)
@@ -170,7 +170,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
        if (!(vma->vm_flags & VM_WRITE))
                goto out_unlock_mmap;
 
-       ret = follow_pte(vma->vm_mm, mmio_addr, NULL, &ptep, NULL, &ptl);
+       ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl);
        if (ret)
                goto out_unlock_mmap;
 
@@ -311,7 +311,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
        if (!(vma->vm_flags & VM_WRITE))
                goto out_unlock_mmap;
 
-       ret = follow_pte(vma->vm_mm, mmio_addr, NULL, &ptep, NULL, &ptl);
+       ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl);
        if (ret)
                goto out_unlock_mmap;
 
index 7fc82a2..3a9a0b0 100644 (file)
@@ -11,8 +11,8 @@
 
 #include <asm/processor.h>
 #include <asm/barrier.h>
-#include <asm/qrwlock.h>
 #include <asm/qspinlock.h>
+#include <asm/qrwlock.h>
 
 #endif /* !(__ASSEMBLY__) */
 
index 84b8878..f1957b3 100644 (file)
 #define X86_FEATURE_PER_THREAD_MBA     (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
+#define X86_FEATURE_AVX_VNNI           (12*32+ 4) /* AVX VNNI instructions */
 #define X86_FEATURE_AVX512_BF16                (12*32+ 5) /* AVX512 BFLOAT16 instructions */
 
 /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
 #define X86_FEATURE_AVIC               (15*32+13) /* Virtual Interrupt Controller */
 #define X86_FEATURE_V_VMSAVE_VMLOAD    (15*32+15) /* Virtual VMSAVE VMLOAD */
 #define X86_FEATURE_VGIF               (15*32+16) /* Virtual GIF */
+#define X86_FEATURE_SVME_ADDR_CHK      (15*32+28) /* "" SVME addr check */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
 #define X86_FEATURE_AVX512VBMI         (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
new file mode 100644 (file)
index 0000000..355a2ab
--- /dev/null
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_NULL)
+BUILD_BUG_ON(1)
+#endif
+
+/*
+ * KVM_X86_OP() and KVM_X86_OP_NULL() are used to help generate
+ * "static_call()"s. They are also intended for use when defining
+ * the vmx/svm kvm_x86_ops. KVM_X86_OP() can be used for those
+ * functions that follow the [svm|vmx]_func_name convention.
+ * KVM_X86_OP_NULL() can leave a NULL definition for the
+ * case where there is no definition, or where the supplied
+ * function name doesn't match the typical naming convention.
+ */
+KVM_X86_OP_NULL(hardware_enable)
+KVM_X86_OP_NULL(hardware_disable)
+KVM_X86_OP_NULL(hardware_unsetup)
+KVM_X86_OP_NULL(cpu_has_accelerated_tpr)
+KVM_X86_OP(has_emulated_msr)
+KVM_X86_OP(vcpu_after_set_cpuid)
+KVM_X86_OP(vm_init)
+KVM_X86_OP_NULL(vm_destroy)
+KVM_X86_OP(vcpu_create)
+KVM_X86_OP(vcpu_free)
+KVM_X86_OP(vcpu_reset)
+KVM_X86_OP(prepare_guest_switch)
+KVM_X86_OP(vcpu_load)
+KVM_X86_OP(vcpu_put)
+KVM_X86_OP(update_exception_bitmap)
+KVM_X86_OP(get_msr)
+KVM_X86_OP(set_msr)
+KVM_X86_OP(get_segment_base)
+KVM_X86_OP(get_segment)
+KVM_X86_OP(get_cpl)
+KVM_X86_OP(set_segment)
+KVM_X86_OP_NULL(get_cs_db_l_bits)
+KVM_X86_OP(set_cr0)
+KVM_X86_OP(is_valid_cr4)
+KVM_X86_OP(set_cr4)
+KVM_X86_OP(set_efer)
+KVM_X86_OP(get_idt)
+KVM_X86_OP(set_idt)
+KVM_X86_OP(get_gdt)
+KVM_X86_OP(set_gdt)
+KVM_X86_OP(sync_dirty_debug_regs)
+KVM_X86_OP(set_dr7)
+KVM_X86_OP(cache_reg)
+KVM_X86_OP(get_rflags)
+KVM_X86_OP(set_rflags)
+KVM_X86_OP(tlb_flush_all)
+KVM_X86_OP(tlb_flush_current)
+KVM_X86_OP_NULL(tlb_remote_flush)
+KVM_X86_OP_NULL(tlb_remote_flush_with_range)
+KVM_X86_OP(tlb_flush_gva)
+KVM_X86_OP(tlb_flush_guest)
+KVM_X86_OP(run)
+KVM_X86_OP_NULL(handle_exit)
+KVM_X86_OP_NULL(skip_emulated_instruction)
+KVM_X86_OP_NULL(update_emulated_instruction)
+KVM_X86_OP(set_interrupt_shadow)
+KVM_X86_OP(get_interrupt_shadow)
+KVM_X86_OP(patch_hypercall)
+KVM_X86_OP(set_irq)
+KVM_X86_OP(set_nmi)
+KVM_X86_OP(queue_exception)
+KVM_X86_OP(cancel_injection)
+KVM_X86_OP(interrupt_allowed)
+KVM_X86_OP(nmi_allowed)
+KVM_X86_OP(get_nmi_mask)
+KVM_X86_OP(set_nmi_mask)
+KVM_X86_OP(enable_nmi_window)
+KVM_X86_OP(enable_irq_window)
+KVM_X86_OP(update_cr8_intercept)
+KVM_X86_OP(check_apicv_inhibit_reasons)
+KVM_X86_OP_NULL(pre_update_apicv_exec_ctrl)
+KVM_X86_OP(refresh_apicv_exec_ctrl)
+KVM_X86_OP(hwapic_irr_update)
+KVM_X86_OP(hwapic_isr_update)
+KVM_X86_OP_NULL(guest_apic_has_interrupt)
+KVM_X86_OP(load_eoi_exitmap)
+KVM_X86_OP(set_virtual_apic_mode)
+KVM_X86_OP_NULL(set_apic_access_page_addr)
+KVM_X86_OP(deliver_posted_interrupt)
+KVM_X86_OP_NULL(sync_pir_to_irr)
+KVM_X86_OP(set_tss_addr)
+KVM_X86_OP(set_identity_map_addr)
+KVM_X86_OP(get_mt_mask)
+KVM_X86_OP(load_mmu_pgd)
+KVM_X86_OP_NULL(has_wbinvd_exit)
+KVM_X86_OP(write_l1_tsc_offset)
+KVM_X86_OP(get_exit_info)
+KVM_X86_OP(check_intercept)
+KVM_X86_OP(handle_exit_irqoff)
+KVM_X86_OP_NULL(request_immediate_exit)
+KVM_X86_OP(sched_in)
+KVM_X86_OP_NULL(slot_enable_log_dirty)
+KVM_X86_OP_NULL(slot_disable_log_dirty)
+KVM_X86_OP_NULL(flush_log_dirty)
+KVM_X86_OP_NULL(enable_log_dirty_pt_masked)
+KVM_X86_OP_NULL(cpu_dirty_log_size)
+KVM_X86_OP_NULL(pre_block)
+KVM_X86_OP_NULL(post_block)
+KVM_X86_OP_NULL(vcpu_blocking)
+KVM_X86_OP_NULL(vcpu_unblocking)
+KVM_X86_OP_NULL(update_pi_irte)
+KVM_X86_OP_NULL(apicv_post_state_restore)
+KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt)
+KVM_X86_OP_NULL(set_hv_timer)
+KVM_X86_OP_NULL(cancel_hv_timer)
+KVM_X86_OP(setup_mce)
+KVM_X86_OP(smi_allowed)
+KVM_X86_OP(pre_enter_smm)
+KVM_X86_OP(pre_leave_smm)
+KVM_X86_OP(enable_smi_window)
+KVM_X86_OP_NULL(mem_enc_op)
+KVM_X86_OP_NULL(mem_enc_reg_region)
+KVM_X86_OP_NULL(mem_enc_unreg_region)
+KVM_X86_OP(get_msr_feature)
+KVM_X86_OP(can_emulate_instruction)
+KVM_X86_OP(apic_init_signal_blocked)
+KVM_X86_OP_NULL(enable_direct_tlbflush)
+KVM_X86_OP_NULL(migrate_timers)
+KVM_X86_OP(msr_filter_changed)
+KVM_X86_OP_NULL(complete_emulated_msr)
+
+#undef KVM_X86_OP
+#undef KVM_X86_OP_NULL
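The header is an x-macro list: each includer defines KVM_X86_OP()/KVM_X86_OP_NULL() before including it and gets one expansion per hook (the kvm_host.h hunk further down uses exactly this pattern to declare and update static calls). A hedged sketch of the other intended use named in the comment, filling a vendor kvm_x86_ops table from functions that follow the vmx_<hook> convention, might look like this (illustration only, not code from this patch):

/* Sketch (assumption): build the VMX ops table from the x-macro list. */
#define KVM_X86_OP(func)        .func = vmx_##func,
#define KVM_X86_OP_NULL(func)   /* leave .func NULL */
static struct kvm_x86_ops vmx_x86_ops __initdata = {
#include <asm/kvm-x86-ops.h>
};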
index 3d6616f..84499aa 100644 (file)
 #define KVM_MAX_VCPUS 288
 #define KVM_SOFT_MAX_VCPUS 240
 #define KVM_MAX_VCPU_ID 1023
-#define KVM_USER_MEM_SLOTS 509
 /* memory slots that are not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 3
-#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
 
 #define KVM_HALT_POLL_NS_DEFAULT 200000
 
@@ -52,6 +50,9 @@
 #define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
                                        KVM_DIRTY_LOG_INITIALLY_SET)
 
+#define KVM_BUS_LOCK_DETECTION_VALID_MODE      (KVM_BUS_LOCK_DETECTION_OFF | \
+                                                KVM_BUS_LOCK_DETECTION_EXIT)
+
 /* x86-specific vcpu->requests bit members */
 #define KVM_REQ_MIGRATE_TIMER          KVM_ARCH_REQ(0)
 #define KVM_REQ_REPORT_TPR_ACCESS      KVM_ARCH_REQ(1)
@@ -200,9 +201,17 @@ enum x86_intercept_stage;
 #define DR6_BS         (1 << 14)
 #define DR6_BT         (1 << 15)
 #define DR6_RTM                (1 << 16)
-#define DR6_FIXED_1    0xfffe0ff0
-#define DR6_INIT       0xffff0ff0
+/*
+ * DR6_ACTIVE_LOW combines fixed-1 and active-low bits.
+ * We can regard all the bits in DR6_FIXED_1 as active-low bits;
+ * they are always 1 for now, but if they are ever defined
+ * in the future, no code change will be required.
+ *
+ * DR6_ACTIVE_LOW is also used as the init/reset value for DR6.
+ */
+#define DR6_ACTIVE_LOW 0xffff0ff0
 #define DR6_VOLATILE   0x0001e00f
+#define DR6_FIXED_1    (DR6_ACTIVE_LOW & ~DR6_VOLATILE)
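The practical consequence of the active-low convention is that code tracking a #DB payload as "set bit = condition hit" converts it into the architectural DR6 value with a single XOR against DR6_ACTIVE_LOW; a minimal sketch (illustrative, not taken verbatim from the patch):

/* Sketch: fold an active-high #DB payload into an architectural DR6 value.
 * Bits in DR6_ACTIVE_LOW read as 1 when the condition did NOT occur, so a
 * single XOR flips exactly the payload bits that are active-low. */
u64 dr6 = DR6_ACTIVE_LOW;        /* no conditions reported; also the reset value */
dr6 |= payload;                  /* set the active-high bits (B0-B3, BS, BD, ...) */
dr6 ^= payload & DR6_ACTIVE_LOW; /* clear the active-low ones (e.g. RTM)          */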
 
 #define DR7_BP_EN_MASK 0x000000ff
 #define DR7_GE         (1 << 9)
@@ -337,6 +346,8 @@ struct kvm_mmu_root_info {
 
 #define KVM_MMU_NUM_PREV_ROOTS 3
 
+#define KVM_HAVE_MMU_RWLOCK
+
 struct kvm_mmu_page;
 
 /*
@@ -358,8 +369,6 @@ struct kvm_mmu {
        int (*sync_page)(struct kvm_vcpu *vcpu,
                         struct kvm_mmu_page *sp);
        void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
-       void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-                          u64 *spte, const void *pte);
        hpa_t root_hpa;
        gpa_t root_pgd;
        union kvm_mmu_role mmu_role;
@@ -510,6 +519,7 @@ struct kvm_vcpu_hv_synic {
 
 /* Hyper-V per vcpu emulation context */
 struct kvm_vcpu_hv {
+       struct kvm_vcpu *vcpu;
        u32 vp_index;
        u64 hv_vapic;
        s64 runtime_offset;
@@ -520,6 +530,15 @@ struct kvm_vcpu_hv {
        cpumask_t tlb_flush;
 };
 
+/* Xen HVM per vcpu emulation context */
+struct kvm_vcpu_xen {
+       u64 hypercall_rip;
+       bool vcpu_info_set;
+       bool vcpu_time_info_set;
+       struct gfn_to_hva_cache vcpu_info_cache;
+       struct gfn_to_hva_cache vcpu_time_info_cache;
+};
+
 struct kvm_vcpu_arch {
        /*
         * rip and regs accesses must go through
@@ -640,7 +659,7 @@ struct kvm_vcpu_arch {
        int cpuid_nent;
        struct kvm_cpuid_entry2 *cpuid_entries;
 
-       unsigned long cr3_lm_rsvd_bits;
+       u64 reserved_gpa_bits;
        int maxphyaddr;
        int max_tdp_level;
 
@@ -717,7 +736,9 @@ struct kvm_vcpu_arch {
        /* used for guest single stepping over the given code position */
        unsigned long singlestep_rip;
 
-       struct kvm_vcpu_hv hyperv;
+       bool hyperv_enabled;
+       struct kvm_vcpu_hv *hyperv;
+       struct kvm_vcpu_xen xen;
 
        cpumask_var_t wbinvd_dirty_mask;
 
@@ -888,6 +909,14 @@ struct msr_bitmap_range {
        unsigned long *bitmap;
 };
 
+/* Xen emulation context */
+struct kvm_xen {
+       bool long_mode;
+       bool shinfo_set;
+       u8 upcall_vector;
+       struct gfn_to_hva_cache shinfo_cache;
+};
+
 enum kvm_irqchip_mode {
        KVM_IRQCHIP_NONE,
        KVM_IRQCHIP_KERNEL,       /* created with KVM_CREATE_IRQCHIP */
@@ -967,6 +996,7 @@ struct kvm_arch {
        struct hlist_head mask_notifier_list;
 
        struct kvm_hv hyperv;
+       struct kvm_xen xen;
 
        #ifdef CONFIG_KVM_MMU_AUDIT
        int audit_point;
@@ -998,9 +1028,12 @@ struct kvm_arch {
                struct msr_bitmap_range ranges[16];
        } msr_filter;
 
+       bool bus_lock_detection_enabled;
+
        struct kvm_pmu_event_filter *pmu_event_filter;
        struct task_struct *nx_lpage_recovery_thread;
 
+#ifdef CONFIG_X86_64
        /*
         * Whether the TDP MMU is enabled for this VM. This contains a
         * snapshot of the TDP MMU module parameter from when the VM was
@@ -1026,12 +1059,25 @@ struct kvm_arch {
         * tdp_mmu_page set and a root_count of 0.
         */
        struct list_head tdp_mmu_pages;
+
+       /*
+        * Protects accesses to the following fields when the MMU lock
+        * is held in read mode:
+        *  - tdp_mmu_pages (above)
+        *  - the link field of struct kvm_mmu_page used by the TDP MMU
+        *  - lpage_disallowed_mmu_pages
+        *  - the lpage_disallowed_link field of struct kvm_mmu_page used
+        *    by the TDP MMU
+        * It is acceptable, but not necessary, to acquire this lock when
+        * the thread holds the MMU lock in write mode.
+        */
+       spinlock_t tdp_mmu_pages_lock;
+#endif /* CONFIG_X86_64 */
 };
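Concretely, the new spinlock lets TDP MMU paths that hold the MMU lock only for read still serialize list updates; a hedged sketch of the intended pattern (field names as in the hunk above, `sp` standing in for a struct kvm_mmu_page being added):

/* Sketch: add a shadow page to the TDP MMU list while holding the
 * MMU rwlock for read (see KVM_HAVE_MMU_RWLOCK above). */
read_lock(&kvm->mmu_lock);

spin_lock(&kvm->arch.tdp_mmu_pages_lock);
list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

read_unlock(&kvm->mmu_lock);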
 
 struct kvm_vm_stat {
        ulong mmu_shadow_zapped;
        ulong mmu_pte_write;
-       ulong mmu_pte_updated;
        ulong mmu_pde_zapped;
        ulong mmu_flooded;
        ulong mmu_recycled;
@@ -1340,6 +1386,19 @@ extern u64 __read_mostly host_efer;
 extern bool __read_mostly allow_smaller_maxphyaddr;
 extern struct kvm_x86_ops kvm_x86_ops;
 
+#define KVM_X86_OP(func) \
+       DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));
+#define KVM_X86_OP_NULL KVM_X86_OP
+#include <asm/kvm-x86-ops.h>
+
+static inline void kvm_ops_static_call_update(void)
+{
+#define KVM_X86_OP(func) \
+       static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+#define KVM_X86_OP_NULL KVM_X86_OP
+#include <asm/kvm-x86-ops.h>
+}
+
 #define __KVM_HAVE_ARCH_VM_ALLOC
 static inline struct kvm *kvm_arch_alloc_vm(void)
 {
@@ -1351,7 +1410,7 @@ void kvm_arch_free_vm(struct kvm *kvm);
 static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
 {
        if (kvm_x86_ops.tlb_remote_flush &&
-           !kvm_x86_ops.tlb_remote_flush(kvm))
+           !static_call(kvm_x86_tlb_remote_flush)(kvm))
                return 0;
        else
                return -ENOTSUPP;
@@ -1421,6 +1480,8 @@ extern u8   kvm_tsc_scaling_ratio_frac_bits;
 extern u64  kvm_max_tsc_scaling_ratio;
 /* 1ull << kvm_tsc_scaling_ratio_frac_bits */
 extern u64  kvm_default_tsc_scaling_ratio;
+/* bus lock detection supported? */
+extern bool kvm_has_bus_lock_exit;
 
 extern u64 kvm_mce_cap_supported;
 
@@ -1501,7 +1562,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
+void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
@@ -1742,14 +1803,12 @@ static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
 
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
 {
-       if (kvm_x86_ops.vcpu_blocking)
-               kvm_x86_ops.vcpu_blocking(vcpu);
+       static_call_cond(kvm_x86_vcpu_blocking)(vcpu);
 }
 
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
 {
-       if (kvm_x86_ops.vcpu_unblocking)
-               kvm_x86_ops.vcpu_unblocking(vcpu);
+       static_call_cond(kvm_x86_vcpu_unblocking)(vcpu);
 }
 
 static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
index 9aad0e0..8757078 100644 (file)
@@ -30,16 +30,29 @@ static inline int cpu_has_vmx(void)
 }
 
 
-/** Disable VMX on the current CPU
+/**
+ * cpu_vmxoff() - Disable VMX on the current CPU
  *
- * vmxoff causes a undefined-opcode exception if vmxon was not run
- * on the CPU previously. Only call this function if you know VMX
- * is enabled.
+ * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
+ *
+ * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
+ * atomically track post-VMXON state, e.g. this may be called in NMI context.
+ * Eat all faults, as all other VMXOFF faults are mode related, i.e.
+ * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
+ * magically in RM, VM86, compat mode, or at CPL>0.
  */
-static inline void cpu_vmxoff(void)
+static inline int cpu_vmxoff(void)
 {
-       asm volatile ("vmxoff");
+       asm_volatile_goto("1: vmxoff\n\t"
+                         _ASM_EXTABLE(1b, %l[fault])
+                         ::: "cc", "memory" : fault);
+
+       cr4_clear_bits(X86_CR4_VMXE);
+       return 0;
+
+fault:
        cr4_clear_bits(X86_CR4_VMXE);
+       return -EIO;
 }
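Callers that only need a best-effort VMX exit (crash or emergency-reboot paths) can now check the return value instead of risking an unhandled #UD; a small illustrative caller, not taken from this patch:

/* Sketch: best-effort VMX teardown on the current CPU. */
if (cpu_has_vmx() && cpu_vmxoff())
        pr_warn("VMXOFF faulted: CPU was not in VMX operation\n");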
 
 static inline int cpu_vmx_enabled(void)
index 38ca445..358707f 100644 (file)
@@ -73,6 +73,7 @@
 #define SECONDARY_EXEC_PT_USE_GPA              VMCS_CONTROL_BIT(PT_USE_GPA)
 #define SECONDARY_EXEC_TSC_SCALING              VMCS_CONTROL_BIT(TSC_SCALING)
 #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE   VMCS_CONTROL_BIT(USR_WAIT_PAUSE)
+#define SECONDARY_EXEC_BUS_LOCK_DETECTION      VMCS_CONTROL_BIT(BUS_LOCK_DETECTION)
 
 #define PIN_BASED_EXT_INTR_MASK                 VMCS_CONTROL_BIT(INTR_EXITING)
 #define PIN_BASED_NMI_EXITING                   VMCS_CONTROL_BIT(NMI_EXITING)
index 9915990..d9a7468 100644 (file)
@@ -83,5 +83,6 @@
 #define VMX_FEATURE_TSC_SCALING                ( 2*32+ 25) /* Scale hardware TSC when read in guest */
 #define VMX_FEATURE_USR_WAIT_PAUSE     ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */
 #define VMX_FEATURE_ENCLV_EXITING      ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */
+#define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */
 
 #endif /* _ASM_X86_VMXFEATURES_H */
index 9139b3e..baca0b0 100644 (file)
@@ -182,6 +182,9 @@ struct arch_shared_info {
        unsigned long p2m_cr3;          /* cr3 value of the p2m address space */
        unsigned long p2m_vaddr;        /* virtual address of the p2m list */
        unsigned long p2m_generation;   /* generation count of p2m mapping */
+#ifdef CONFIG_X86_32
+       uint32_t wc_sec_hi;
+#endif
 };
 #endif /* !__ASSEMBLY__ */
 
index 8e76d37..5a3022c 100644 (file)
@@ -112,6 +112,7 @@ struct kvm_ioapic_state {
 #define KVM_NR_IRQCHIPS          3
 
 #define KVM_RUN_X86_SMM                 (1 << 0)
+#define KVM_RUN_X86_BUS_LOCK     (1 << 1)
 
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
index ada955c..b8e650a 100644 (file)
@@ -89,6 +89,7 @@
 #define EXIT_REASON_XRSTORS             64
 #define EXIT_REASON_UMWAIT              67
 #define EXIT_REASON_TPAUSE              68
+#define EXIT_REASON_BUS_LOCK            74
 
 #define VMX_EXIT_REASONS \
        { EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
        { EXIT_REASON_XSAVES,                "XSAVES" }, \
        { EXIT_REASON_XRSTORS,               "XRSTORS" }, \
        { EXIT_REASON_UMWAIT,                "UMWAIT" }, \
-       { EXIT_REASON_TPAUSE,                "TPAUSE" }
+       { EXIT_REASON_TPAUSE,                "TPAUSE" }, \
+       { EXIT_REASON_BUS_LOCK,              "BUS_LOCK" }
 
 #define VMX_EXIT_REASON_FLAGS \
        { VMX_EXIT_REASONS_FAILED_VMENTRY,      "FAILED_VMENTRY" }
index 6bd20c0..dea2b44 100644 (file)
@@ -1743,6 +1743,7 @@ void apic_ap_setup(void)
 
 #ifdef CONFIG_X86_X2APIC
 int x2apic_mode;
+EXPORT_SYMBOL_GPL(x2apic_mode);
 
 enum {
        X2APIC_OFF,
index db11594..efbaef8 100644 (file)
@@ -538,31 +538,21 @@ static void emergency_vmx_disable_all(void)
        local_irq_disable();
 
        /*
-        * We need to disable VMX on all CPUs before rebooting, otherwise
-        * we risk hanging up the machine, because the CPU ignores INIT
-        * signals when VMX is enabled.
+        * Disable VMX on all CPUs before rebooting, otherwise we risk hanging
+        * the machine, because the CPU blocks INIT when it's in VMX root.
         *
-        * We can't take any locks and we may be on an inconsistent
-        * state, so we use NMIs as IPIs to tell the other CPUs to disable
-        * VMX and halt.
+        * We can't take any locks and we may be in an inconsistent state, so
+        * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt.
         *
-        * For safety, we will avoid running the nmi_shootdown_cpus()
-        * stuff unnecessarily, but we don't have a way to check
-        * if other CPUs have VMX enabled. So we will call it only if the
-        * CPU we are running on has VMX enabled.
-        *
-        * We will miss cases where VMX is not enabled on all CPUs. This
-        * shouldn't do much harm because KVM always enable VMX on all
-        * CPUs anyway. But we can miss it on the small window where KVM
-        * is still enabling VMX.
+        * Do the NMI shootdown even if VMX is off on _this_ CPU, as that
+        * doesn't prevent a different CPU from being in VMX root operation.
         */
-       if (cpu_has_vmx() && cpu_vmx_enabled()) {
-               /* Disable VMX on this CPU. */
-               cpu_vmxoff();
+       if (cpu_has_vmx()) {
+               /* Safely force _this_ CPU out of VMX root operation. */
+               __cpu_emergency_vmxoff();
 
-               /* Halt and disable VMX on the other CPUs */
+               /* Halt and exit VMX root operation on the other CPUs. */
                nmi_shootdown_cpus(vmxoff_nmi);
-
        }
 }
 
index 4bd14ab..aeab168 100644 (file)
@@ -14,10 +14,11 @@ kvm-y                       += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
                                $(KVM)/dirty_ring.o
 kvm-$(CONFIG_KVM_ASYNC_PF)     += $(KVM)/async_pf.o
 
-kvm-y                  += x86.o emulate.o i8259.o irq.o lapic.o \
+kvm-y                  += x86.o emulate.o i8259.o irq.o lapic.o xen.o \
                           i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
                           hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
-                          mmu/spte.o mmu/tdp_iter.o mmu/tdp_mmu.o
+                          mmu/spte.o
+kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
 
 kvm-intel-y            += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
                           vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
index 13036cf..c8f2592 100644 (file)
@@ -173,16 +173,22 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
        kvm_update_pv_runtime(vcpu);
 
        vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
-       kvm_mmu_reset_context(vcpu);
+       vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
 
        kvm_pmu_refresh(vcpu);
        vcpu->arch.cr4_guest_rsvd_bits =
            __cr4_reserved_bits(guest_cpuid_has, vcpu);
 
-       vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+       kvm_hv_set_cpuid(vcpu);
 
        /* Invoke the vendor callback only after the above state is updated. */
-       kvm_x86_ops.vcpu_after_set_cpuid(vcpu);
+       static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
+
+       /*
+        * Except for the MMU, which needs to be reset after any vendor
+        * specific adjustments to the reserved GPA bits.
+        */
+       kvm_mmu_reset_context(vcpu);
 }
 
 static int is_efer_nx(void)
@@ -223,6 +229,16 @@ not_found:
        return 36;
 }
 
+/*
+ * This "raw" version returns the reserved GPA bits without any adjustments for
+ * encryption technologies that usurp bits.  The raw mask should be used if and
+ * only if hardware does _not_ strip the usurped bits, e.g. in virtual MTRRs.
+ */
+u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
+{
+       return rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+}
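As a concrete illustration (assumed value, not from the patch): for a vCPU whose CPUID reports MAXPHYADDR = 48, the raw mask covers physical-address bits 48..63.

/* Sketch: raw reserved-GPA mask for MAXPHYADDR == 48. */
u64 raw = rsvd_bits(48, 63);    /* == 0xffff000000000000ULL */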
+
 /* when an old userspace process fills a new kernel module */
 int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
                             struct kvm_cpuid *cpuid,
@@ -321,7 +337,7 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
        if (cpuid->nent < vcpu->arch.cpuid_nent)
                goto out;
        r = -EFAULT;
-       if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
+       if (copy_to_user(entries, vcpu->arch.cpuid_entries,
                         vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
                goto out;
        return 0;
@@ -434,7 +450,7 @@ void kvm_set_cpu_caps(void)
                kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
 
        kvm_cpu_cap_mask(CPUID_7_1_EAX,
-               F(AVX512_BF16)
+               F(AVX_VNNI) | F(AVX512_BF16)
        );
 
        kvm_cpu_cap_mask(CPUID_D_1_EAX,
index dc921d7..2a0c506 100644 (file)
@@ -30,15 +30,32 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
               u32 *ecx, u32 *edx, bool exact_only);
 
 int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
+u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu);
 
 static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.maxphyaddr;
 }
 
+static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+       return !(gpa & vcpu->arch.reserved_gpa_bits);
+}
+
 static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
 {
-       return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu)));
+       return !kvm_vcpu_is_legal_gpa(vcpu, gpa);
+}
+
+static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu,
+                                                gpa_t gpa, gpa_t alignment)
+{
+       return IS_ALIGNED(gpa, alignment) && kvm_vcpu_is_legal_gpa(vcpu, gpa);
+}
+
+static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+       return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE);
 }
 
 struct cpuid_reg {
@@ -324,11 +341,6 @@ static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature)
                kvm_cpu_cap_set(x86_feature);
 }
 
-static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
-       return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
-}
-
 static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
                                         unsigned int kvm_feature)
 {
index 56cae1f..f7970ba 100644 (file)
@@ -2506,12 +2506,12 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
 
        val = GET_SMSTATE(u32, smstate, 0x7fcc);
 
-       if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1))
+       if (ctxt->ops->set_dr(ctxt, 6, val))
                return X86EMUL_UNHANDLEABLE;
 
        val = GET_SMSTATE(u32, smstate, 0x7fc8);
 
-       if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1))
+       if (ctxt->ops->set_dr(ctxt, 7, val))
                return X86EMUL_UNHANDLEABLE;
 
        selector =                 GET_SMSTATE(u32, smstate, 0x7fc4);
@@ -2564,14 +2564,14 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
        ctxt->_eip   = GET_SMSTATE(u64, smstate, 0x7f78);
        ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED;
 
-       val = GET_SMSTATE(u32, smstate, 0x7f68);
+       val = GET_SMSTATE(u64, smstate, 0x7f68);
 
-       if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1))
+       if (ctxt->ops->set_dr(ctxt, 6, val))
                return X86EMUL_UNHANDLEABLE;
 
-       val = GET_SMSTATE(u32, smstate, 0x7f60);
+       val = GET_SMSTATE(u64, smstate, 0x7f60);
 
-       if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1))
+       if (ctxt->ops->set_dr(ctxt, 7, val))
                return X86EMUL_UNHANDLEABLE;
 
        cr0 =                       GET_SMSTATE(u64, smstate, 0x7f58);
@@ -2879,6 +2879,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
        ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
        *reg_write(ctxt, VCPU_REGS_RSP) = (efer & EFER_LMA) ? msr_data :
                                                              (u32)msr_data;
+       if (efer & EFER_LMA)
+               ctxt->mode = X86EMUL_MODE_PROT64;
 
        return X86EMUL_CONTINUE;
 }
@@ -4327,7 +4329,7 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
 
                ctxt->ops->get_dr(ctxt, 6, &dr6);
                dr6 &= ~DR_TRAP_BITS;
-               dr6 |= DR6_BD | DR6_RTM;
+               dr6 |= DR6_BD | DR6_ACTIVE_LOW;
                ctxt->ops->set_dr(ctxt, 6, dr6);
                return emulate_db(ctxt);
        }
index 922c69d..7d2dae9 100644 (file)
@@ -23,6 +23,7 @@
 #include "ioapic.h"
 #include "cpuid.h"
 #include "hyperv.h"
+#include "xen.h"
 
 #include <linux/cpu.h>
 #include <linux/kvm_host.h>
@@ -36,6 +37,9 @@
 #include "trace.h"
 #include "irq.h"
 
+/* "Hv#1" signature */
+#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648
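The constant is simply the ASCII string "Hv#1" packed little-endian into EAX; a quick sanity check (illustrative only):

/* 'H' = 0x48, 'v' = 0x76, '#' = 0x23, '1' = 0x31, little-endian in EAX. */
static_assert(HYPERV_CPUID_SIGNATURE_EAX ==
              ('H' | ('v' << 8) | ('#' << 16) | ('1' << 24)));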
+
 #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64)
 
 static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
@@ -128,7 +132,7 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
        synic_update_vector(synic, vector);
 
        /* Load SynIC vectors into EOI exit bitmap */
-       kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic));
+       kvm_make_request(KVM_REQ_SCAN_IOAPIC, hv_synic_to_vcpu(synic));
        return 0;
 }
 
@@ -141,10 +145,10 @@ static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
                return NULL;
 
        vcpu = kvm_get_vcpu(kvm, vpidx);
-       if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
+       if (vcpu && kvm_hv_get_vpindex(vcpu) == vpidx)
                return vcpu;
        kvm_for_each_vcpu(i, vcpu, kvm)
-               if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
+               if (kvm_hv_get_vpindex(vcpu) == vpidx)
                        return vcpu;
        return NULL;
 }
@@ -157,15 +161,15 @@ static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx)
        vcpu = get_vcpu_by_vpidx(kvm, vpidx);
        if (!vcpu)
                return NULL;
-       synic = vcpu_to_synic(vcpu);
+       synic = to_hv_synic(vcpu);
        return (synic->active) ? synic : NULL;
 }
 
 static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
 {
        struct kvm *kvm = vcpu->kvm;
-       struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
-       struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+       struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
+       struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
        struct kvm_vcpu_hv_stimer *stimer;
        int gsi, idx;
 
@@ -189,8 +193,8 @@ static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
 
 static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
 {
-       struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
-       struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+       struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+       struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
 
        hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC;
        hv_vcpu->exit.u.synic.msr = msr;
@@ -204,7 +208,7 @@ static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
 static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
                         u32 msr, u64 data, bool host)
 {
-       struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+       struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
        int ret;
 
        if (!synic->active && !host)
@@ -282,8 +286,7 @@ static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu)
 
 static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu)
 {
-       struct kvm *kvm = vcpu->kvm;
-       struct kvm_hv *hv = &kvm->arch.hyperv;
+       struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
 
        if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL)
                hv->hv_syndbg.control.status =
@@ -293,8 +296,8 @@ static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu)
 
 static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr)
 {
-       struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
-       struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+       struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
+       struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
 
        hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG;
        hv_vcpu->exit.u.syndbg.msr = msr;
@@ -310,13 +313,13 @@ static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr)
 
 static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 {
-       struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
+       struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
 
        if (!kvm_hv_is_syndbg_enabled(vcpu) && !host)
                return 1;
 
        trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id,
-                                   vcpu_to_hv_vcpu(vcpu)->vp_index, msr, data);
+                                   to_hv_vcpu(vcpu)->vp_index, msr, data);
        switch (msr) {
        case HV_X64_MSR_SYNDBG_CONTROL:
                syndbg->control.control = data;
@@ -349,7 +352,7 @@ static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 
 static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
 {
-       struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
+       struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
 
        if (!kvm_hv_is_syndbg_enabled(vcpu) && !host)
                return 1;
@@ -377,9 +380,7 @@ static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
                break;
        }
 
-       trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id,
-                                   vcpu_to_hv_vcpu(vcpu)->vp_index, msr,
-                                   *pdata);
+       trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, kvm_hv_get_vpindex(vcpu), msr, *pdata);
 
        return 0;
 }
@@ -421,7 +422,7 @@ static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata,
 
 static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
 {
-       struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+       struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
        struct kvm_lapic_irq irq;
        int ret, vector;
 
@@ -457,7 +458,7 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint)
 
 void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
 {
-       struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+       struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
        int i;
 
        trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector);
@@ -514,7 +515,7 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
 
 static u64 get_time_ref_counter(struct kvm *kvm)
 {
-       struct kvm_hv *hv = &kvm->arch.hyperv;
+       struct kvm_hv *hv = to_kvm_hv(kvm);
        struct kvm_vcpu *vcpu;
        u64 tsc;
 
@@ -534,10 +535,10 @@ static u64 get_time_ref_counter(struct kvm *kvm)
 static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
                                bool vcpu_kick)
 {
-       struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+       struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
 
        set_bit(stimer->index,
-               vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+               to_hv_vcpu(vcpu)->stimer_pending_bitmap);
        kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
        if (vcpu_kick)
                kvm_vcpu_kick(vcpu);
@@ -545,14 +546,14 @@ static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
 
 static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer)
 {
-       struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+       struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
 
-       trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id,
+       trace_kvm_hv_stimer_cleanup(hv_stimer_to_vcpu(stimer)->vcpu_id,
                                    stimer->index);
 
        hrtimer_cancel(&stimer->timer);
        clear_bit(stimer->index,
-                 vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+                 to_hv_vcpu(vcpu)->stimer_pending_bitmap);
        stimer->msg_pending = false;
        stimer->exp_time = 0;
 }
@@ -562,7 +563,7 @@ static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer)
        struct kvm_vcpu_hv_stimer *stimer;
 
        stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer);
-       trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id,
+       trace_kvm_hv_stimer_callback(hv_stimer_to_vcpu(stimer)->vcpu_id,
                                     stimer->index);
        stimer_mark_pending(stimer, true);
 
@@ -579,7 +580,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
        u64 time_now;
        ktime_t ktime_now;
 
-       time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm);
+       time_now = get_time_ref_counter(hv_stimer_to_vcpu(stimer)->kvm);
        ktime_now = ktime_get();
 
        if (stimer->config.periodic) {
@@ -596,7 +597,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
                        stimer->exp_time = time_now + stimer->count;
 
                trace_kvm_hv_stimer_start_periodic(
-                                       stimer_to_vcpu(stimer)->vcpu_id,
+                                       hv_stimer_to_vcpu(stimer)->vcpu_id,
                                        stimer->index,
                                        time_now, stimer->exp_time);
 
@@ -618,7 +619,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
                return 0;
        }
 
-       trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id,
+       trace_kvm_hv_stimer_start_one_shot(hv_stimer_to_vcpu(stimer)->vcpu_id,
                                           stimer->index,
                                           time_now, stimer->count);
 
@@ -633,13 +634,13 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
 {
        union hv_stimer_config new_config = {.as_uint64 = config},
                old_config = {.as_uint64 = stimer->config.as_uint64};
-       struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
-       struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+       struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+       struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
 
        if (!synic->active && !host)
                return 1;
 
-       trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
+       trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id,
                                       stimer->index, config, host);
 
        stimer_cleanup(stimer);
@@ -657,13 +658,13 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
 static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
                            bool host)
 {
-       struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
-       struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+       struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+       struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
 
        if (!synic->active && !host)
                return 1;
 
-       trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
+       trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id,
                                      stimer->index, count, host);
 
        stimer_cleanup(stimer);
@@ -694,7 +695,7 @@ static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount)
 static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
                             struct hv_message *src_msg, bool no_retry)
 {
-       struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+       struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
        int msg_off = offsetof(struct hv_message_page, sint_message[sint]);
        gfn_t msg_page_gfn;
        struct hv_message_header hv_hdr;
@@ -750,7 +751,7 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
 
 static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
 {
-       struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+       struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
        struct hv_message *msg = &stimer->msg;
        struct hv_timer_message_payload *payload =
                        (struct hv_timer_message_payload *)&msg->u.payload;
@@ -763,14 +764,14 @@ static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
 
        payload->expiration_time = stimer->exp_time;
        payload->delivery_time = get_time_ref_counter(vcpu->kvm);
-       return synic_deliver_msg(vcpu_to_synic(vcpu),
+       return synic_deliver_msg(to_hv_synic(vcpu),
                                 stimer->config.sintx, msg,
                                 no_retry);
 }
 
 static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer)
 {
-       struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+       struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
        struct kvm_lapic_irq irq = {
                .delivery_mode = APIC_DM_FIXED,
                .vector = stimer->config.apic_vector
@@ -790,7 +791,7 @@ static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
                r = stimer_send_msg(stimer);
        else
                r = stimer_notify_direct(stimer);
-       trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id,
+       trace_kvm_hv_stimer_expiration(hv_stimer_to_vcpu(stimer)->vcpu_id,
                                       stimer->index, direct, r);
        if (!r) {
                stimer->msg_pending = false;
@@ -801,11 +802,14 @@ static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
 
 void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+       struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
        struct kvm_vcpu_hv_stimer *stimer;
        u64 time_now, exp_time;
        int i;
 
+       if (!hv_vcpu)
+               return;
+
        for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
                if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) {
                        stimer = &hv_vcpu->stimer[i];
@@ -831,16 +835,27 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
 
 void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+       struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
        int i;
 
+       if (!hv_vcpu)
+               return;
+
        for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
                stimer_cleanup(&hv_vcpu->stimer[i]);
+
+       kfree(hv_vcpu);
+       vcpu->arch.hyperv = NULL;
 }
 
 bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu)
 {
-       if (!(vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE))
+       struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+       if (!hv_vcpu)
+               return false;
+
+       if (!(hv_vcpu->hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE))
                return false;
        return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
 }
@@ -880,28 +895,41 @@ static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index)
        stimer_prepare_msg(stimer);
 }
 
-void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+       struct kvm_vcpu_hv *hv_vcpu;
        int i;
 
+       hv_vcpu = kzalloc(sizeof(struct kvm_vcpu_hv), GFP_KERNEL_ACCOUNT);
+       if (!hv_vcpu)
+               return -ENOMEM;
+
+       vcpu->arch.hyperv = hv_vcpu;
+       hv_vcpu->vcpu = vcpu;
+
        synic_init(&hv_vcpu->synic);
 
        bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
        for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
                stimer_init(&hv_vcpu->stimer[i], i);
-}
-
-void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu)
-{
-       struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
 
        hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu);
+
+       return 0;
 }
 
 int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
 {
-       struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+       struct kvm_vcpu_hv_synic *synic;
+       int r;
+
+       if (!to_hv_vcpu(vcpu)) {
+               r = kvm_hv_vcpu_init(vcpu);
+               if (r)
+                       return r;
+       }
+
+       synic = to_hv_synic(vcpu);
 
        /*
         * Hyper-V SynIC auto EOI SINT's are
@@ -939,10 +967,9 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
        return r;
 }
 
-static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
-                                    u32 index, u64 *pdata)
+static int kvm_hv_msr_get_crash_data(struct kvm *kvm, u32 index, u64 *pdata)
 {
-       struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+       struct kvm_hv *hv = to_kvm_hv(kvm);
        size_t size = ARRAY_SIZE(hv->hv_crash_param);
 
        if (WARN_ON_ONCE(index >= size))
@@ -952,41 +979,26 @@ static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata)
+static int kvm_hv_msr_get_crash_ctl(struct kvm *kvm, u64 *pdata)
 {
-       struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+       struct kvm_hv *hv = to_kvm_hv(kvm);
 
        *pdata = hv->hv_crash_ctl;
        return 0;
 }
 
-static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host)
+static int kvm_hv_msr_set_crash_ctl(struct kvm *kvm, u64 data)
 {
-       struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
-
-       if (host)
-               hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY;
+       struct kvm_hv *hv = to_kvm_hv(kvm);
 
-       if (!host && (data & HV_CRASH_CTL_CRASH_NOTIFY)) {
-
-               vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
-                         hv->hv_crash_param[0],
-                         hv->hv_crash_param[1],
-                         hv->hv_crash_param[2],
-                         hv->hv_crash_param[3],
-                         hv->hv_crash_param[4]);
-
-               /* Send notification about crash to user space */
-               kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
-       }
+       hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY;
 
        return 0;
 }
 
-static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
-                                    u32 index, u64 data)
+static int kvm_hv_msr_set_crash_data(struct kvm *kvm, u32 index, u64 data)
 {
-       struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+       struct kvm_hv *hv = to_kvm_hv(kvm);
        size_t size = ARRAY_SIZE(hv->hv_crash_param);
 
        if (WARN_ON_ONCE(index >= size))
@@ -1068,7 +1080,7 @@ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
 void kvm_hv_setup_tsc_page(struct kvm *kvm,
                           struct pvclock_vcpu_time_info *hv_clock)
 {
-       struct kvm_hv *hv = &kvm->arch.hyperv;
+       struct kvm_hv *hv = to_kvm_hv(kvm);
        u32 tsc_seq;
        u64 gfn;
 
@@ -1078,7 +1090,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
        if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
                return;
 
-       mutex_lock(&kvm->arch.hyperv.hv_lock);
+       mutex_lock(&hv->hv_lock);
        if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
                goto out_unlock;
 
@@ -1122,14 +1134,14 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
        kvm_write_guest(kvm, gfn_to_gpa(gfn),
                        &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
 out_unlock:
-       mutex_unlock(&kvm->arch.hyperv.hv_lock);
+       mutex_unlock(&hv->hv_lock);
 }
 
 static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
                             bool host)
 {
        struct kvm *kvm = vcpu->kvm;
-       struct kvm_hv *hv = &kvm->arch.hyperv;
+       struct kvm_hv *hv = to_kvm_hv(kvm);
 
        switch (msr) {
        case HV_X64_MSR_GUEST_OS_ID:
@@ -1139,9 +1151,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
                        hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
                break;
        case HV_X64_MSR_HYPERCALL: {
-               u64 gfn;
-               unsigned long addr;
-               u8 instructions[4];
+               u8 instructions[9];
+               int i = 0;
+               u64 addr;
 
                /* if guest os id is not set hypercall should remain disabled */
                if (!hv->hv_guest_os_id)
@@ -1150,16 +1162,33 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
                        hv->hv_hypercall = data;
                        break;
                }
-               gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
-               addr = gfn_to_hva(kvm, gfn);
-               if (kvm_is_error_hva(addr))
-                       return 1;
-               kvm_x86_ops.patch_hypercall(vcpu, instructions);
-               ((unsigned char *)instructions)[3] = 0xc3; /* ret */
-               if (__copy_to_user((void __user *)addr, instructions, 4))
+
+               /*
+                * If Xen and Hyper-V hypercalls are both enabled, disambiguate
+                * the same way Xen itself does, by setting bit 31 of EAX,
+                * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and is
+                * simply clobbered on 64-bit.
+                */
+               if (kvm_xen_hypercall_enabled(kvm)) {
+                       /* orl $0x80000000, %eax */
+                       instructions[i++] = 0x0d;
+                       instructions[i++] = 0x00;
+                       instructions[i++] = 0x00;
+                       instructions[i++] = 0x00;
+                       instructions[i++] = 0x80;
+               }
+
+               /* vmcall/vmmcall */
+               static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i);
+               i += 3;
+
+               /* ret */
+               ((unsigned char *)instructions)[i++] = 0xc3;
+
+               addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK;
+               if (kvm_vcpu_write_guest(vcpu, addr, instructions, i))
                        return 1;
                hv->hv_hypercall = data;
-               mark_page_dirty(kvm, gfn);
                break;
        }
        case HV_X64_MSR_REFERENCE_TSC:
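
As a standalone illustration (not kernel code): the HV_X64_MSR_HYPERCALL hunk above writes at most nine bytes into the guest's hypercall page, an optional orl $0x80000000, %eax when Xen hypercalls are also enabled, the vendor-specific vmcall/vmmcall emitted by patch_hypercall, and a final ret. The sketch below assumes the Intel vmcall encoding (0f 01 c1); on AMD the patched bytes would be vmmcall (0f 01 d9).

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	int main(void)
	{
		uint8_t page[9];
		int i = 0;

		/* orl $0x80000000, %eax: only present when Xen hypercalls are enabled too */
		static const uint8_t or_eax[] = { 0x0d, 0x00, 0x00, 0x00, 0x80 };
		memcpy(page + i, or_eax, sizeof(or_eax));
		i += sizeof(or_eax);

		/* vmcall (assumed Intel encoding; vmmcall, 0f 01 d9, on AMD) */
		static const uint8_t vmcall[] = { 0x0f, 0x01, 0xc1 };
		memcpy(page + i, vmcall, sizeof(vmcall));
		i += sizeof(vmcall);

		/* ret */
		page[i++] = 0xc3;

		for (int j = 0; j < i; j++)
			printf("%02x ", page[j]);
		printf("(%d bytes)\n", i);
		return 0;
	}

With the prefix in place, bit 31 of EAX lets KVM tell a Hyper-V call from a Xen one, since that bit is reserved-zero in the 32-bit Hyper-V ABI and simply clobbered in the 64-bit one, as the comment in the hunk notes.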
@@ -1168,11 +1197,25 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
                        kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
                break;
        case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
-               return kvm_hv_msr_set_crash_data(vcpu,
+               return kvm_hv_msr_set_crash_data(kvm,
                                                 msr - HV_X64_MSR_CRASH_P0,
                                                 data);
        case HV_X64_MSR_CRASH_CTL:
-               return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
+               if (host)
+                       return kvm_hv_msr_set_crash_ctl(kvm, data);
+
+               if (data & HV_CRASH_CTL_CRASH_NOTIFY) {
+                       vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
+                                  hv->hv_crash_param[0],
+                                  hv->hv_crash_param[1],
+                                  hv->hv_crash_param[2],
+                                  hv->hv_crash_param[3],
+                                  hv->hv_crash_param[4]);
+
+                       /* Send notification about crash to user space */
+                       kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
+               }
+               break;
        case HV_X64_MSR_RESET:
                if (data == 1) {
                        vcpu_debug(vcpu, "hyper-v reset requested\n");
@@ -1216,11 +1259,11 @@ static u64 current_task_runtime_100ns(void)
 
 static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 {
-       struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+       struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
 
        switch (msr) {
        case HV_X64_MSR_VP_INDEX: {
-               struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+               struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
                int vcpu_idx = kvm_vcpu_get_idx(vcpu);
                u32 new_vp_index = (u32)data;
 
@@ -1291,14 +1334,14 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
        case HV_X64_MSR_SIMP:
        case HV_X64_MSR_EOM:
        case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
-               return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host);
+               return synic_set_msr(to_hv_synic(vcpu), msr, data, host);
        case HV_X64_MSR_STIMER0_CONFIG:
        case HV_X64_MSR_STIMER1_CONFIG:
        case HV_X64_MSR_STIMER2_CONFIG:
        case HV_X64_MSR_STIMER3_CONFIG: {
                int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
 
-               return stimer_set_config(vcpu_to_stimer(vcpu, timer_index),
+               return stimer_set_config(to_hv_stimer(vcpu, timer_index),
                                         data, host);
        }
        case HV_X64_MSR_STIMER0_COUNT:
@@ -1307,7 +1350,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
        case HV_X64_MSR_STIMER3_COUNT: {
                int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
 
-               return stimer_set_count(vcpu_to_stimer(vcpu, timer_index),
+               return stimer_set_count(to_hv_stimer(vcpu, timer_index),
                                        data, host);
        }
        case HV_X64_MSR_TSC_FREQUENCY:
@@ -1330,7 +1373,7 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
 {
        u64 data = 0;
        struct kvm *kvm = vcpu->kvm;
-       struct kvm_hv *hv = &kvm->arch.hyperv;
+       struct kvm_hv *hv = to_kvm_hv(kvm);
 
        switch (msr) {
        case HV_X64_MSR_GUEST_OS_ID:
@@ -1346,11 +1389,11 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
                data = hv->hv_tsc_page;
                break;
        case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
-               return kvm_hv_msr_get_crash_data(vcpu,
+               return kvm_hv_msr_get_crash_data(kvm,
                                                 msr - HV_X64_MSR_CRASH_P0,
                                                 pdata);
        case HV_X64_MSR_CRASH_CTL:
-               return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
+               return kvm_hv_msr_get_crash_ctl(kvm, pdata);
        case HV_X64_MSR_RESET:
                data = 0;
                break;
@@ -1379,7 +1422,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
                          bool host)
 {
        u64 data = 0;
-       struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+       struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
 
        switch (msr) {
        case HV_X64_MSR_VP_INDEX:
@@ -1403,14 +1446,14 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
        case HV_X64_MSR_SIMP:
        case HV_X64_MSR_EOM:
        case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
-               return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host);
+               return synic_get_msr(to_hv_synic(vcpu), msr, pdata, host);
        case HV_X64_MSR_STIMER0_CONFIG:
        case HV_X64_MSR_STIMER1_CONFIG:
        case HV_X64_MSR_STIMER2_CONFIG:
        case HV_X64_MSR_STIMER3_CONFIG: {
                int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
 
-               return stimer_get_config(vcpu_to_stimer(vcpu, timer_index),
+               return stimer_get_config(to_hv_stimer(vcpu, timer_index),
                                         pdata);
        }
        case HV_X64_MSR_STIMER0_COUNT:
@@ -1419,7 +1462,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
        case HV_X64_MSR_STIMER3_COUNT: {
                int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
 
-               return stimer_get_count(vcpu_to_stimer(vcpu, timer_index),
+               return stimer_get_count(to_hv_stimer(vcpu, timer_index),
                                        pdata);
        }
        case HV_X64_MSR_TSC_FREQUENCY:
@@ -1438,12 +1481,22 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
 
 int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 {
+       struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+       if (!host && !vcpu->arch.hyperv_enabled)
+               return 1;
+
+       if (!to_hv_vcpu(vcpu)) {
+               if (kvm_hv_vcpu_init(vcpu))
+                       return 1;
+       }
+
        if (kvm_hv_msr_partition_wide(msr)) {
                int r;
 
-               mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
+               mutex_lock(&hv->hv_lock);
                r = kvm_hv_set_msr_pw(vcpu, msr, data, host);
-               mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
+               mutex_unlock(&hv->hv_lock);
                return r;
        } else
                return kvm_hv_set_msr(vcpu, msr, data, host);
@@ -1451,12 +1504,22 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 
 int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
 {
+       struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+       if (!host && !vcpu->arch.hyperv_enabled)
+               return 1;
+
+       if (!to_hv_vcpu(vcpu)) {
+               if (kvm_hv_vcpu_init(vcpu))
+                       return 1;
+       }
+
        if (kvm_hv_msr_partition_wide(msr)) {
                int r;
 
-               mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
+               mutex_lock(&hv->hv_lock);
                r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host);
-               mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
+               mutex_unlock(&hv->hv_lock);
                return r;
        } else
                return kvm_hv_get_msr(vcpu, msr, pdata, host);
@@ -1466,7 +1529,7 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask(
        struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask,
        u64 *vp_bitmap, unsigned long *vcpu_bitmap)
 {
-       struct kvm_hv *hv = &kvm->arch.hyperv;
+       struct kvm_hv *hv = to_kvm_hv(kvm);
        struct kvm_vcpu *vcpu;
        int i, bank, sbank = 0;
 
@@ -1483,18 +1546,16 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask(
 
        bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
        kvm_for_each_vcpu(i, vcpu, kvm) {
-               if (test_bit(vcpu_to_hv_vcpu(vcpu)->vp_index,
-                            (unsigned long *)vp_bitmap))
+               if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap))
                        __set_bit(i, vcpu_bitmap);
        }
        return vcpu_bitmap;
 }
 
-static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
-                           u16 rep_cnt, bool ex)
+static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool ex)
 {
-       struct kvm *kvm = current_vcpu->kvm;
-       struct kvm_vcpu_hv *hv_vcpu = &current_vcpu->arch.hyperv;
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
        struct hv_tlb_flush_ex flush_ex;
        struct hv_tlb_flush flush;
        u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
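
The flush path above operates on a Hyper-V "sparse" VP set: a 64-bit valid_bank_mask saying which 64-VP banks are present, followed by one 64-bit bank per set bit. The expansion loop of sparse_set_to_vcpu_mask() is truncated in this excerpt, so the following is only a plausible standalone model of what it does before the kvm_for_each_vcpu() pass shown above; names and sizes are illustrative.

	#include <stdio.h>
	#include <stdint.h>

	/* Expand (valid_bank_mask, sparse_banks[]) into a flat per-bank VP bitmap. */
	static void sparse_set_to_vp_bitmap(uint64_t valid_bank_mask,
					    const uint64_t *sparse_banks,
					    uint64_t *vp_bitmap, int nbanks)
	{
		int bank, sbank = 0;

		for (bank = 0; bank < nbanks; bank++)
			vp_bitmap[bank] = 0;

		for (bank = 0; bank < nbanks; bank++) {
			if (valid_bank_mask & (1ull << bank))
				vp_bitmap[bank] = sparse_banks[sbank++];
		}
	}

	int main(void)
	{
		/* banks 0 and 2 present: VPs 1 and 3, plus VP 130 (bank 2, bit 2) */
		uint64_t sparse[] = { (1ull << 1) | (1ull << 3), 1ull << 2 };
		uint64_t vp_bitmap[3];
		int i;

		sparse_set_to_vp_bitmap((1ull << 0) | (1ull << 2), sparse, vp_bitmap, 3);
		for (i = 0; i < 3; i++)
			printf("bank %d: %#llx\n", i, (unsigned long long)vp_bitmap[i]);
		return 0;
	}

The kvm_for_each_vcpu() loop in the hunk then tests each vCPU's VP index (via the new kvm_hv_get_vpindex() helper) against this flat bitmap to build the final vcpu_bitmap.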
@@ -1592,10 +1653,10 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
        }
 }
 
-static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa,
+static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa,
                           bool ex, bool fast)
 {
-       struct kvm *kvm = current_vcpu->kvm;
+       struct kvm *kvm = vcpu->kvm;
        struct hv_send_ipi_ex send_ipi_ex;
        struct hv_send_ipi send_ipi;
        u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
@@ -1666,9 +1727,20 @@ ret_success:
        return HV_STATUS_SUCCESS;
 }
 
-bool kvm_hv_hypercall_enabled(struct kvm *kvm)
+void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
 {
-       return READ_ONCE(kvm->arch.hyperv.hv_guest_os_id) != 0;
+       struct kvm_cpuid_entry2 *entry;
+
+       entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0);
+       if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX)
+               vcpu->arch.hyperv_enabled = true;
+       else
+               vcpu->arch.hyperv_enabled = false;
+}
+
+bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id;
 }
 
 static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
@@ -1698,6 +1770,7 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
 
 static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
 {
+       struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
        struct eventfd_ctx *eventfd;
 
        if (unlikely(!fast)) {
@@ -1726,7 +1799,7 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
 
        /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */
        rcu_read_lock();
-       eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param);
+       eventfd = idr_find(&hv->conn_to_evt, param);
        rcu_read_unlock();
        if (!eventfd)
                return HV_STATUS_INVALID_PORT_ID;
@@ -1745,7 +1818,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
         * hypercall generates UD from non zero cpl and real mode
         * per HYPER-V spec
         */
-       if (kvm_x86_ops.get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
+       if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) {
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }
@@ -1793,7 +1866,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
                fallthrough;    /* maybe userspace knows this conn_id */
        case HVCALL_POST_MESSAGE:
                /* don't bother userspace if it has no way to handle it */
-               if (unlikely(rep || !vcpu_to_synic(vcpu)->active)) {
+               if (unlikely(rep || !to_hv_synic(vcpu)->active)) {
                        ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
                        break;
                }
@@ -1855,7 +1928,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
                }
                fallthrough;
        case HVCALL_RESET_DEBUG_SESSION: {
-               struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
+               struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
 
                if (!kvm_hv_is_syndbg_enabled(vcpu)) {
                        ret = HV_STATUS_INVALID_HYPERCALL_CODE;
@@ -1885,23 +1958,26 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 
 void kvm_hv_init_vm(struct kvm *kvm)
 {
-       mutex_init(&kvm->arch.hyperv.hv_lock);
-       idr_init(&kvm->arch.hyperv.conn_to_evt);
+       struct kvm_hv *hv = to_kvm_hv(kvm);
+
+       mutex_init(&hv->hv_lock);
+       idr_init(&hv->conn_to_evt);
 }
 
 void kvm_hv_destroy_vm(struct kvm *kvm)
 {
+       struct kvm_hv *hv = to_kvm_hv(kvm);
        struct eventfd_ctx *eventfd;
        int i;
 
-       idr_for_each_entry(&kvm->arch.hyperv.conn_to_evt, eventfd, i)
+       idr_for_each_entry(&hv->conn_to_evt, eventfd, i)
                eventfd_ctx_put(eventfd);
-       idr_destroy(&kvm->arch.hyperv.conn_to_evt);
+       idr_destroy(&hv->conn_to_evt);
 }
 
 static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
 {
-       struct kvm_hv *hv = &kvm->arch.hyperv;
+       struct kvm_hv *hv = to_kvm_hv(kvm);
        struct eventfd_ctx *eventfd;
        int ret;
 
@@ -1925,7 +2001,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
 
 static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id)
 {
-       struct kvm_hv *hv = &kvm->arch.hyperv;
+       struct kvm_hv *hv = to_kvm_hv(kvm);
        struct eventfd_ctx *eventfd;
 
        mutex_lock(&hv->hv_lock);
@@ -1997,8 +2073,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
                        break;
 
                case HYPERV_CPUID_INTERFACE:
-                       memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12);
-                       ent->eax = signature[0];
+                       ent->eax = HYPERV_CPUID_SIGNATURE_EAX;
                        break;
 
                case HYPERV_CPUID_VERSION:
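
The HYPERV_CPUID_INTERFACE change above replaces the open-coded "Hv#1" signature with the HYPERV_CPUID_SIGNATURE_EAX constant. The two are equivalent: the removed code took the first four bytes of "Hv#1" as a u32, which on a little-endian host is 0x31237648, the assumed value of that constant. A tiny standalone check, not kernel code:

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	int main(void)
	{
		uint32_t eax = 0x31237648;	/* assumed HYPERV_CPUID_SIGNATURE_EAX */
		char sig[5] = "";

		/* on a little-endian host this reproduces the old signature bytes */
		memcpy(sig, &eax, sizeof(eax));
		printf("%s\n", sig);		/* prints Hv#1 */
		return 0;
	}

The new kvm_hv_set_cpuid() earlier in the diff keys vcpu->arch.hyperv_enabled off the same value.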
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 6d7def2..e951af1 100644 (file)
 /* Hyper-V HV_X64_MSR_SYNDBG_OPTIONS bits */
 #define HV_X64_SYNDBG_OPTION_USE_HCALLS                BIT(2)
 
-static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu)
+static inline struct kvm_hv *to_kvm_hv(struct kvm *kvm)
 {
-       return &vcpu->arch.hyperv;
+       return &kvm->arch.hyperv;
 }
 
-static inline struct kvm_vcpu *hv_vcpu_to_vcpu(struct kvm_vcpu_hv *hv_vcpu)
+static inline struct kvm_vcpu_hv *to_hv_vcpu(struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu_arch *arch;
-
-       arch = container_of(hv_vcpu, struct kvm_vcpu_arch, hyperv);
-       return container_of(arch, struct kvm_vcpu, arch);
+       return vcpu->arch.hyperv;
 }
 
-static inline struct kvm_vcpu_hv_synic *vcpu_to_synic(struct kvm_vcpu *vcpu)
+static inline struct kvm_vcpu_hv_synic *to_hv_synic(struct kvm_vcpu *vcpu)
 {
-       return &vcpu->arch.hyperv.synic;
+       struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+       return &hv_vcpu->synic;
 }
 
-static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
+static inline struct kvm_vcpu *hv_synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
 {
-       return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic));
+       struct kvm_vcpu_hv *hv_vcpu = container_of(synic, struct kvm_vcpu_hv, synic);
+
+       return hv_vcpu->vcpu;
 }
 
-static inline struct kvm_hv_syndbg *vcpu_to_hv_syndbg(struct kvm_vcpu *vcpu)
+static inline struct kvm_hv_syndbg *to_hv_syndbg(struct kvm_vcpu *vcpu)
 {
        return &vcpu->kvm->arch.hyperv.hv_syndbg;
 }
 
+static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
+{
+       struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+       return hv_vcpu ? hv_vcpu->vp_index : kvm_vcpu_get_idx(vcpu);
+}
+
 int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
 int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
 
-bool kvm_hv_hypercall_enabled(struct kvm *kvm);
+bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu);
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
 
 void kvm_hv_irq_routing_update(struct kvm *kvm);
@@ -89,32 +97,35 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
 void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
 int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
 
-void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
-void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu);
 void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
 
 bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
 bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
                            struct hv_vp_assist_page *assist_page);
 
-static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu,
-                                                       int timer_index)
+static inline struct kvm_vcpu_hv_stimer *to_hv_stimer(struct kvm_vcpu *vcpu,
+                                                     int timer_index)
 {
-       return &vcpu_to_hv_vcpu(vcpu)->stimer[timer_index];
+       return &to_hv_vcpu(vcpu)->stimer[timer_index];
 }
 
-static inline struct kvm_vcpu *stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
+static inline struct kvm_vcpu *hv_stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
 {
        struct kvm_vcpu_hv *hv_vcpu;
 
        hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv,
                               stimer[0]);
-       return hv_vcpu_to_vcpu(hv_vcpu);
+       return hv_vcpu->vcpu;
 }
 
 static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
 {
-       return !bitmap_empty(vcpu->arch.hyperv.stimer_pending_bitmap,
+       struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+       if (!hv_vcpu)
+               return false;
+
+       return !bitmap_empty(hv_vcpu->stimer_pending_bitmap,
                             HV_SYNIC_STIMER_COUNT);
 }
 
@@ -125,6 +136,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
 
 void kvm_hv_init_vm(struct kvm *kvm);
 void kvm_hv_destroy_vm(struct kvm *kvm);
+void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu);
 int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
 int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
                     struct kvm_cpuid_entry2 __user *entries);
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 814698e..172b053 100644 (file)
@@ -14,6 +14,7 @@
 #include "irq.h"
 #include "i8254.h"
 #include "x86.h"
+#include "xen.h"
 
 /*
  * check if there are pending timer events
@@ -56,6 +57,9 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v)
        if (!lapic_in_kernel(v))
                return v->arch.interrupt.injected;
 
+       if (kvm_xen_has_interrupt(v))
+               return 1;
+
        if (!kvm_apic_accept_pic_intr(v))
                return 0;
 
@@ -110,6 +114,9 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v)
        if (!lapic_in_kernel(v))
                return v->arch.interrupt.nr;
 
+       if (kvm_xen_has_interrupt(v))
+               return v->kvm->arch.xen.upcall_vector;
+
        if (irqchip_split(v->kvm)) {
                int vector = v->arch.pending_external_vector;
 
@@ -143,8 +150,7 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
 {
        __kvm_migrate_apic_timer(vcpu);
        __kvm_migrate_pit_timer(vcpu);
-       if (kvm_x86_ops.migrate_timers)
-               kvm_x86_ops.migrate_timers(vcpu);
+       static_call_cond(kvm_x86_migrate_timers)(vcpu);
 }
 
 bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index f15bc16..2e11da2 100644 (file)
@@ -9,31 +9,6 @@
        (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR  \
         | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE)
 
-static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
-                                            enum kvm_reg reg)
-{
-       return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
-}
-
-static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
-                                        enum kvm_reg reg)
-{
-       return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
-}
-
-static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
-                                              enum kvm_reg reg)
-{
-       __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
-}
-
-static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
-                                          enum kvm_reg reg)
-{
-       __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
-       __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
-}
-
 #define BUILD_KVM_GPR_ACCESSORS(lname, uname)                                \
 static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\
 {                                                                            \
@@ -43,7 +18,6 @@ static __always_inline void kvm_##lname##_write(struct kvm_vcpu *vcpu,              \
                                                unsigned long val)            \
 {                                                                            \
        vcpu->arch.regs[VCPU_REGS_##uname] = val;                             \
-       kvm_register_mark_dirty(vcpu, VCPU_REGS_##uname);                     \
 }
 BUILD_KVM_GPR_ACCESSORS(rax, RAX)
 BUILD_KVM_GPR_ACCESSORS(rbx, RBX)
@@ -63,13 +37,38 @@ BUILD_KVM_GPR_ACCESSORS(r14, R14)
 BUILD_KVM_GPR_ACCESSORS(r15, R15)
 #endif
 
+static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
+                                            enum kvm_reg reg)
+{
+       return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
+                                        enum kvm_reg reg)
+{
+       return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
+static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
+                                              enum kvm_reg reg)
+{
+       __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
+                                          enum kvm_reg reg)
+{
+       __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+       __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
 static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
 {
        if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
                return 0;
 
        if (!kvm_register_is_available(vcpu, reg))
-               kvm_x86_ops.cache_reg(vcpu, reg);
+               static_call(kvm_x86_cache_reg)(vcpu, reg);
 
        return vcpu->arch.regs[reg];
 }
@@ -109,7 +108,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
        might_sleep();  /* on svm */
 
        if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
-               kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_PDPTR);
+               static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_PDPTR);
 
        return vcpu->arch.walk_mmu->pdptrs[index];
 }
@@ -119,7 +118,7 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
        ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
        if ((tmask & vcpu->arch.cr0_guest_owned_bits) &&
            !kvm_register_is_available(vcpu, VCPU_EXREG_CR0))
-               kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR0);
+               static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR0);
        return vcpu->arch.cr0 & mask;
 }
 
@@ -133,14 +132,14 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
        ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
        if ((tmask & vcpu->arch.cr4_guest_owned_bits) &&
            !kvm_register_is_available(vcpu, VCPU_EXREG_CR4))
-               kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR4);
+               static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR4);
        return vcpu->arch.cr4 & mask;
 }
 
 static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
 {
        if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
-               kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR3);
+               static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR3);
        return vcpu->arch.cr3;
 }
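
Beyond the static_call() conversion, the hunks above show kvm_cache_regs.h's lazy register cache: a read first consults regs_avail and only falls back to the vendor .cache_reg hook when the value has not yet been pulled out of hardware. The following is a userspace toy model of that pattern; all names in it are invented for illustration, it is not the kernel's implementation.

	#include <stdint.h>
	#include <stdio.h>

	enum reg { REG_RAX, REG_CR3, NR_REGS };

	struct vcpu_model {
		uint64_t regs[NR_REGS];
		uint64_t avail;			/* which cached values are valid */
	};

	/* stand-in for the expensive VMCS/VMCB read done by the .cache_reg hook */
	static uint64_t hw_read(enum reg r)
	{
		printf("slow hardware read of reg %d\n", r);
		return 0x1000 + r;
	}

	static uint64_t reg_read(struct vcpu_model *v, enum reg r)
	{
		if (!(v->avail & (1ull << r))) {
			v->regs[r] = hw_read(r);	/* fill the cache on first use */
			v->avail |= 1ull << r;
		}
		return v->regs[r];
	}

	int main(void)
	{
		struct vcpu_model v = { {0}, 0 };

		reg_read(&v, REG_CR3);	/* slow path, populates the cache */
		reg_read(&v, REG_CR3);	/* served from the cache */
		return 0;
	}

The static_call(kvm_x86_cache_reg)(...) sites in the diff are the kernel's slow path; regs_dirty plays the analogous role in the write direction.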
 
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 43c93ff..0d35911 100644 (file)
@@ -205,7 +205,7 @@ struct x86_emulate_ops {
        ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
        int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
        int (*cpl)(struct x86_emulate_ctxt *ctxt);
-       int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
+       void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
        int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
        u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
        void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 43ccead..45d40bf 100644 (file)
@@ -91,8 +91,8 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
        return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
 }
 
-struct static_key_deferred apic_hw_disabled __read_mostly;
-struct static_key_deferred apic_sw_disabled __read_mostly;
+__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ);
+__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ);
 
 static inline int apic_enabled(struct kvm_lapic *apic)
 {
@@ -290,9 +290,9 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
        if (enabled != apic->sw_enabled) {
                apic->sw_enabled = enabled;
                if (enabled)
-                       static_key_slow_dec_deferred(&apic_sw_disabled);
+                       static_branch_slow_dec_deferred(&apic_sw_disabled);
                else
-                       static_key_slow_inc(&apic_sw_disabled.key);
+                       static_branch_inc(&apic_sw_disabled.key);
 
                atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
        }
@@ -484,7 +484,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
        if (unlikely(vcpu->arch.apicv_active)) {
                /* need to update RVI */
                kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
-               kvm_x86_ops.hwapic_irr_update(vcpu,
+               static_call(kvm_x86_hwapic_irr_update)(vcpu,
                                apic_find_highest_irr(apic));
        } else {
                apic->irr_pending = false;
@@ -515,7 +515,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
         * just set SVI.
         */
        if (unlikely(vcpu->arch.apicv_active))
-               kvm_x86_ops.hwapic_isr_update(vcpu, vec);
+               static_call(kvm_x86_hwapic_isr_update)(vcpu, vec);
        else {
                ++apic->isr_count;
                BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -563,8 +563,8 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
         * and must be left alone.
         */
        if (unlikely(vcpu->arch.apicv_active))
-               kvm_x86_ops.hwapic_isr_update(vcpu,
-                                              apic_find_highest_isr(apic));
+               static_call(kvm_x86_hwapic_isr_update)(vcpu,
+                                               apic_find_highest_isr(apic));
        else {
                --apic->isr_count;
                BUG_ON(apic->isr_count < 0);
@@ -701,7 +701,7 @@ static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
 {
        int highest_irr;
        if (apic->vcpu->arch.apicv_active)
-               highest_irr = kvm_x86_ops.sync_pir_to_irr(apic->vcpu);
+               highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu);
        else
                highest_irr = apic_find_highest_irr(apic);
        if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
@@ -1090,7 +1090,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                                                       apic->regs + APIC_TMR);
                }
 
-               if (kvm_x86_ops.deliver_posted_interrupt(vcpu, vector)) {
+               if (static_call(kvm_x86_deliver_posted_interrupt)(vcpu, vector)) {
                        kvm_lapic_set_irr(vector, apic);
                        kvm_make_request(KVM_REQ_EVENT, vcpu);
                        kvm_vcpu_kick(vcpu);
@@ -1245,7 +1245,8 @@ static int apic_set_eoi(struct kvm_lapic *apic)
        apic_clear_isr(vector, apic);
        apic_update_ppr(apic);
 
-       if (test_bit(vector, vcpu_to_synic(apic->vcpu)->vec_bitmap))
+       if (to_hv_vcpu(apic->vcpu) &&
+           test_bit(vector, to_hv_synic(apic->vcpu)->vec_bitmap))
                kvm_hv_synic_send_eoi(apic->vcpu, vector);
 
        kvm_ioapic_send_eoi(apic, vector);
@@ -1814,7 +1815,7 @@ static void cancel_hv_timer(struct kvm_lapic *apic)
 {
        WARN_ON(preemptible());
        WARN_ON(!apic->lapic_timer.hv_timer_in_use);
-       kvm_x86_ops.cancel_hv_timer(apic->vcpu);
+       static_call(kvm_x86_cancel_hv_timer)(apic->vcpu);
        apic->lapic_timer.hv_timer_in_use = false;
 }
 
@@ -1831,7 +1832,7 @@ static bool start_hv_timer(struct kvm_lapic *apic)
        if (!ktimer->tscdeadline)
                return false;
 
-       if (kvm_x86_ops.set_hv_timer(vcpu, ktimer->tscdeadline, &expired))
+       if (static_call(kvm_x86_set_hv_timer)(vcpu, ktimer->tscdeadline, &expired))
                return false;
 
        ktimer->hv_timer_in_use = true;
@@ -2175,10 +2176,10 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
        hrtimer_cancel(&apic->lapic_timer.timer);
 
        if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
-               static_key_slow_dec_deferred(&apic_hw_disabled);
+               static_branch_slow_dec_deferred(&apic_hw_disabled);
 
        if (!apic->sw_enabled)
-               static_key_slow_dec_deferred(&apic_sw_disabled);
+               static_branch_slow_dec_deferred(&apic_sw_disabled);
 
        if (apic->regs)
                free_page((unsigned long)apic->regs);
@@ -2250,9 +2251,9 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
        if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
                if (value & MSR_IA32_APICBASE_ENABLE) {
                        kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
-                       static_key_slow_dec_deferred(&apic_hw_disabled);
+                       static_branch_slow_dec_deferred(&apic_hw_disabled);
                } else {
-                       static_key_slow_inc(&apic_hw_disabled.key);
+                       static_branch_inc(&apic_hw_disabled.key);
                        atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
                }
        }
@@ -2261,7 +2262,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
                kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
 
        if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE))
-               kvm_x86_ops.set_virtual_apic_mode(vcpu);
+               static_call(kvm_x86_set_virtual_apic_mode)(vcpu);
 
        apic->base_address = apic->vcpu->arch.apic_base &
                             MSR_IA32_APICBASE_BASE;
@@ -2338,9 +2339,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
        vcpu->arch.pv_eoi.msr_val = 0;
        apic_update_ppr(apic);
        if (vcpu->arch.apicv_active) {
-               kvm_x86_ops.apicv_post_state_restore(vcpu);
-               kvm_x86_ops.hwapic_irr_update(vcpu, -1);
-               kvm_x86_ops.hwapic_isr_update(vcpu, -1);
+               static_call(kvm_x86_apicv_post_state_restore)(vcpu);
+               static_call(kvm_x86_hwapic_irr_update)(vcpu, -1);
+               static_call(kvm_x86_hwapic_isr_update)(vcpu, -1);
        }
 
        vcpu->arch.apic_arb_prio = 0;
@@ -2449,7 +2450,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
         * thinking that APIC state has changed.
         */
        vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
-       static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
+       static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
        kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
 
        return 0;
@@ -2512,7 +2513,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
         */
 
        apic_clear_irr(vector, apic);
-       if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) {
+       if (to_hv_vcpu(vcpu) && test_bit(vector, to_hv_synic(vcpu)->auto_eoi_bitmap)) {
                /*
                 * For auto-EOI interrupts, there might be another pending
                 * interrupt above PPR, so check whether to raise another
@@ -2601,10 +2602,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
        kvm_apic_update_apicv(vcpu);
        apic->highest_isr_cache = -1;
        if (vcpu->arch.apicv_active) {
-               kvm_x86_ops.apicv_post_state_restore(vcpu);
-               kvm_x86_ops.hwapic_irr_update(vcpu,
+               static_call(kvm_x86_apicv_post_state_restore)(vcpu);
+               static_call(kvm_x86_hwapic_irr_update)(vcpu,
                                apic_find_highest_irr(apic));
-               kvm_x86_ops.hwapic_isr_update(vcpu,
+               static_call(kvm_x86_hwapic_isr_update)(vcpu,
                                apic_find_highest_isr(apic));
        }
        kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -2904,13 +2905,6 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
        }
 }
 
-void kvm_lapic_init(void)
-{
-       /* do not patch jump label more than once per second */
-       jump_label_rate_limit(&apic_hw_disabled, HZ);
-       jump_label_rate_limit(&apic_sw_disabled, HZ);
-}
-
 void kvm_lapic_exit(void)
 {
        static_key_deferred_flush(&apic_hw_disabled);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 4fb86e3..997c45a 100644 (file)
@@ -6,6 +6,8 @@
 
 #include <linux/kvm_host.h>
 
+#include "hyperv.h"
+
 #define KVM_APIC_INIT          0
 #define KVM_APIC_SIPI          1
 #define KVM_APIC_LVT_NUM       6
@@ -125,13 +127,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
 int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
 int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
 
-static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
-}
-
 int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
-void kvm_lapic_init(void);
 void kvm_lapic_exit(void);
 
 #define VEC_POS(v) ((v) & (32 - 1))
@@ -172,29 +168,29 @@ static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 va
        __kvm_lapic_set_reg(apic->regs, reg_off, val);
 }
 
-extern struct static_key kvm_no_apic_vcpu;
+DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
 
 static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
 {
-       if (static_key_false(&kvm_no_apic_vcpu))
+       if (static_branch_unlikely(&kvm_has_noapic_vcpu))
                return vcpu->arch.apic;
        return true;
 }
 
-extern struct static_key_deferred apic_hw_disabled;
+extern struct static_key_false_deferred apic_hw_disabled;
 
 static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
 {
-       if (static_key_false(&apic_hw_disabled.key))
+       if (static_branch_unlikely(&apic_hw_disabled.key))
                return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
        return MSR_IA32_APICBASE_ENABLE;
 }
 
-extern struct static_key_deferred apic_sw_disabled;
+extern struct static_key_false_deferred apic_sw_disabled;
 
 static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic)
 {
-       if (static_key_false(&apic_sw_disabled.key))
+       if (static_branch_unlikely(&apic_sw_disabled.key))
                return apic->sw_enabled;
        return true;
 }
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 581925e..c68bfc3 100644 (file)
 #define PT32_ROOT_LEVEL 2
 #define PT32E_ROOT_LEVEL 3
 
-static inline u64 rsvd_bits(int s, int e)
+static __always_inline u64 rsvd_bits(int s, int e)
 {
+       BUILD_BUG_ON(__builtin_constant_p(e) && __builtin_constant_p(s) && e < s);
+
+       if (__builtin_constant_p(e))
+               BUILD_BUG_ON(e > 63);
+       else
+               e &= 63;
+
        if (e < s)
                return 0;
 
@@ -95,7 +102,7 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
        if (!VALID_PAGE(root_hpa))
                return;
 
-       kvm_x86_ops.load_mmu_pgd(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
+       static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
                                 vcpu->arch.mmu->shadow_root_level);
 }
 
@@ -145,7 +152,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
  *
  * TODO: introduce APIs to split these two cases.
  */
-static inline int is_writable_pte(unsigned long pte)
+static inline bool is_writable_pte(unsigned long pte)
 {
        return pte & PT_WRITABLE_MASK;
 }
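
rsvd_bits(s, e), reworked above to reject out-of-range constants at build time, produces a mask with bits s..e set inclusive; the pa_bits_rsvd parameter introduced later in this diff is effectively rsvd_bits(maxphyaddr, 63) computed once per MMU. The excerpt truncates the mask construction itself, so the standalone sketch below assumes the helper's usual ((2ULL << (e - s)) - 1) << s form.

	#include <stdio.h>
	#include <stdint.h>

	static uint64_t rsvd_bits(int s, int e)
	{
		if (e < s)
			return 0;
		return ((2ULL << (e - s)) - 1) << s;	/* bits s..e inclusive */
	}

	int main(void)
	{
		/* the "large page" range used in __reset_rsvds_bits_mask() */
		printf("rsvd_bits(13, 20) = %#llx\n",
		       (unsigned long long)rsvd_bits(13, 20));	/* 0x1fe000 */
		/* the NX bit, reserved when NX is disabled */
		printf("rsvd_bits(63, 63) = %#llx\n",
		       (unsigned long long)rsvd_bits(63, 63));
		return 0;
	}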
@@ -167,8 +174,8 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                                  unsigned pte_access, unsigned pte_pkey,
                                  unsigned pfec)
 {
-       int cpl = kvm_x86_ops.get_cpl(vcpu);
-       unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
+       int cpl = static_call(kvm_x86_get_cpl)(vcpu);
+       unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
 
        /*
         * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1.
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6d16481..e507568 100644 (file)
@@ -190,7 +190,7 @@ static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
        int ret = -ENOTSUPP;
 
        if (range && kvm_x86_ops.tlb_remote_flush_with_range)
-               ret = kvm_x86_ops.tlb_remote_flush_with_range(kvm, range);
+               ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);
 
        if (ret)
                kvm_flush_remote_tlbs(kvm);
@@ -844,17 +844,17 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
        int i, count = 0;
 
        if (!rmap_head->val) {
-               rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
+               rmap_printk("%p %llx 0->1\n", spte, *spte);
                rmap_head->val = (unsigned long)spte;
        } else if (!(rmap_head->val & 1)) {
-               rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
+               rmap_printk("%p %llx 1->many\n", spte, *spte);
                desc = mmu_alloc_pte_list_desc(vcpu);
                desc->sptes[0] = (u64 *)rmap_head->val;
                desc->sptes[1] = spte;
                rmap_head->val = (unsigned long)desc | 1;
                ++count;
        } else {
-               rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
+               rmap_printk("%p %llx many->many\n", spte, *spte);
                desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
                while (desc->sptes[PTE_LIST_EXT-1]) {
                        count += PTE_LIST_EXT;
@@ -906,14 +906,14 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
                pr_err("%s: %p 0->BUG\n", __func__, spte);
                BUG();
        } else if (!(rmap_head->val & 1)) {
-               rmap_printk("%s:  %p 1->0\n", __func__, spte);
+               rmap_printk("%p 1->0\n", spte);
                if ((u64 *)rmap_head->val != spte) {
                        pr_err("%s:  %p 1->BUG\n", __func__, spte);
                        BUG();
                }
                rmap_head->val = 0;
        } else {
-               rmap_printk("%s:  %p many->many\n", __func__, spte);
+               rmap_printk("%p many->many\n", spte);
                desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
                prev_desc = NULL;
                while (desc) {
@@ -1115,7 +1115,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
              !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
                return false;
 
-       rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
+       rmap_printk("spte %p %llx\n", sptep, *sptep);
 
        if (pt_protect)
                spte &= ~SPTE_MMU_WRITEABLE;
@@ -1142,7 +1142,7 @@ static bool spte_clear_dirty(u64 *sptep)
 {
        u64 spte = *sptep;
 
-       rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
+       rmap_printk("spte %p %llx\n", sptep, *sptep);
 
        MMU_WARN_ON(!spte_ad_enabled(spte));
        spte &= ~shadow_dirty_mask;
@@ -1184,7 +1184,7 @@ static bool spte_set_dirty(u64 *sptep)
 {
        u64 spte = *sptep;
 
-       rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
+       rmap_printk("spte %p %llx\n", sptep, *sptep);
 
        /*
         * Similar to the !kvm_x86_ops.slot_disable_log_dirty case,
@@ -1225,7 +1225,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 {
        struct kvm_rmap_head *rmap_head;
 
-       if (kvm->arch.tdp_mmu_enabled)
+       if (is_tdp_mmu_enabled(kvm))
                kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
                                slot->base_gfn + gfn_offset, mask, true);
        while (mask) {
@@ -1254,7 +1254,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 {
        struct kvm_rmap_head *rmap_head;
 
-       if (kvm->arch.tdp_mmu_enabled)
+       if (is_tdp_mmu_enabled(kvm))
                kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
                                slot->base_gfn + gfn_offset, mask, false);
        while (mask) {
@@ -1283,8 +1283,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
                                gfn_t gfn_offset, unsigned long mask)
 {
        if (kvm_x86_ops.enable_log_dirty_pt_masked)
-               kvm_x86_ops.enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
-                               mask);
+               static_call(kvm_x86_enable_log_dirty_pt_masked)(kvm, slot,
+                                                               gfn_offset,
+                                                               mask);
        else
                kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
@@ -1292,7 +1293,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 int kvm_cpu_dirty_log_size(void)
 {
        if (kvm_x86_ops.cpu_dirty_log_size)
-               return kvm_x86_ops.cpu_dirty_log_size();
+               return static_call(kvm_x86_cpu_dirty_log_size)();
 
        return 0;
 }
@@ -1309,7 +1310,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
                write_protected |= __rmap_write_protect(kvm, rmap_head, true);
        }
 
-       if (kvm->arch.tdp_mmu_enabled)
+       if (is_tdp_mmu_enabled(kvm))
                write_protected |=
                        kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn);
 
@@ -1331,7 +1332,7 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
        bool flush = false;
 
        while ((sptep = rmap_get_first(rmap_head, &iter))) {
-               rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
+               rmap_printk("spte %p %llx.\n", sptep, *sptep);
 
                pte_list_remove(rmap_head, sptep);
                flush = true;
@@ -1363,7 +1364,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 
 restart:
        for_each_rmap_spte(rmap_head, &iter, sptep) {
-               rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
+               rmap_printk("spte %p %llx gfn %llx (%d)\n",
                            sptep, *sptep, gfn, level);
 
                need_flush = 1;
@@ -1456,16 +1457,17 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
             slot_rmap_walk_okay(_iter_);                               \
             slot_rmap_walk_next(_iter_))
 
-static int kvm_handle_hva_range(struct kvm *kvm,
-                               unsigned long start,
-                               unsigned long end,
-                               unsigned long data,
-                               int (*handler)(struct kvm *kvm,
-                                              struct kvm_rmap_head *rmap_head,
-                                              struct kvm_memory_slot *slot,
-                                              gfn_t gfn,
-                                              int level,
-                                              unsigned long data))
+static __always_inline int
+kvm_handle_hva_range(struct kvm *kvm,
+                    unsigned long start,
+                    unsigned long end,
+                    unsigned long data,
+                    int (*handler)(struct kvm *kvm,
+                                   struct kvm_rmap_head *rmap_head,
+                                   struct kvm_memory_slot *slot,
+                                   gfn_t gfn,
+                                   int level,
+                                   unsigned long data))
 {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
@@ -1521,7 +1523,7 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
 
        r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
 
-       if (kvm->arch.tdp_mmu_enabled)
+       if (is_tdp_mmu_enabled(kvm))
                r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);
 
        return r;
@@ -1533,7 +1535,7 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 
        r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
 
-       if (kvm->arch.tdp_mmu_enabled)
+       if (is_tdp_mmu_enabled(kvm))
                r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte);
 
        return r;
@@ -1588,7 +1590,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
        int young = false;
 
        young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
-       if (kvm->arch.tdp_mmu_enabled)
+       if (is_tdp_mmu_enabled(kvm))
                young |= kvm_tdp_mmu_age_hva_range(kvm, start, end);
 
        return young;
@@ -1599,7 +1601,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
        int young = false;
 
        young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
-       if (kvm->arch.tdp_mmu_enabled)
+       if (is_tdp_mmu_enabled(kvm))
                young |= kvm_tdp_mmu_test_age_hva(kvm, hva);
 
        return young;
@@ -1723,13 +1725,6 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
-                                struct kvm_mmu_page *sp, u64 *spte,
-                                const void *pte)
-{
-       WARN_ON(1);
-}
-
 #define KVM_PAGE_ARRAY_NR 16
 
 struct kvm_mmu_pages {
@@ -2016,9 +2011,9 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
                        flush |= kvm_sync_page(vcpu, sp, &invalid_list);
                        mmu_pages_clear_parents(&parents);
                }
-               if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
+               if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
                        kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
-                       cond_resched_lock(&vcpu->kvm->mmu_lock);
+                       cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
                        flush = false;
                }
        }
@@ -2417,7 +2412,7 @@ static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
                return 0;
 
 restart:
-       list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) {
+       list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
                /*
                 * Don't zap active root pages, the page itself can't be freed
                 * and zapping it will just force vCPUs to realloc and reload.
@@ -2470,7 +2465,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
  */
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
 {
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
 
        if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
                kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
@@ -2481,7 +2476,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
 
        kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
 
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
 }
 
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -2492,7 +2487,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 
        pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
        r = 0;
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
        for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
                pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
                         sp->role.word);
@@ -2500,7 +2495,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
                kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
        }
        kvm_mmu_commit_zap_page(kvm, &invalid_list);
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
 
        return r;
 }
@@ -3161,7 +3156,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
        sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
 
        if (kvm_mmu_put_root(kvm, sp)) {
-               if (sp->tdp_mmu_page)
+               if (is_tdp_mmu_page(sp))
                        kvm_tdp_mmu_free_root(kvm, sp);
                else if (sp->role.invalid)
                        kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
@@ -3192,7 +3187,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                        return;
        }
 
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
 
        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
                if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
@@ -3215,7 +3210,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
        }
 
        kvm_mmu_commit_zap_page(kvm, &invalid_list);
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
 
@@ -3236,16 +3231,16 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
 {
        struct kvm_mmu_page *sp;
 
-       spin_lock(&vcpu->kvm->mmu_lock);
+       write_lock(&vcpu->kvm->mmu_lock);
 
        if (make_mmu_pages_available(vcpu)) {
-               spin_unlock(&vcpu->kvm->mmu_lock);
+               write_unlock(&vcpu->kvm->mmu_lock);
                return INVALID_PAGE;
        }
        sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
        ++sp->root_count;
 
-       spin_unlock(&vcpu->kvm->mmu_lock);
+       write_unlock(&vcpu->kvm->mmu_lock);
        return __pa(sp->spt);
 }
 
@@ -3255,7 +3250,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
        hpa_t root;
        unsigned i;
 
-       if (vcpu->kvm->arch.tdp_mmu_enabled) {
+       if (is_tdp_mmu_enabled(vcpu->kvm)) {
                root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
 
                if (!VALID_PAGE(root))
@@ -3416,17 +3411,17 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
                    !smp_load_acquire(&sp->unsync_children))
                        return;
 
-               spin_lock(&vcpu->kvm->mmu_lock);
+               write_lock(&vcpu->kvm->mmu_lock);
                kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 
                mmu_sync_children(vcpu, sp);
 
                kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
-               spin_unlock(&vcpu->kvm->mmu_lock);
+               write_unlock(&vcpu->kvm->mmu_lock);
                return;
        }
 
-       spin_lock(&vcpu->kvm->mmu_lock);
+       write_lock(&vcpu->kvm->mmu_lock);
        kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 
        for (i = 0; i < 4; ++i) {
@@ -3440,7 +3435,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
        }
 
        kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
-       spin_unlock(&vcpu->kvm->mmu_lock);
+       write_unlock(&vcpu->kvm->mmu_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
 
@@ -3724,7 +3719,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                return r;
 
        r = RET_PF_RETRY;
-       spin_lock(&vcpu->kvm->mmu_lock);
+
+       if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+               read_lock(&vcpu->kvm->mmu_lock);
+       else
+               write_lock(&vcpu->kvm->mmu_lock);
+
        if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
                goto out_unlock;
        r = make_mmu_pages_available(vcpu);
@@ -3739,7 +3739,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                                 prefault, is_tdp);
 
 out_unlock:
-       spin_unlock(&vcpu->kvm->mmu_lock);
+       if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+               read_unlock(&vcpu->kvm->mmu_lock);
+       else
+               write_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
        return r;
 }
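
The direct_page_fault() hunk above is where the mmu_lock conversion from spinlock to rwlock pays off: faults handled by the TDP MMU can take the lock shared, while the legacy shadow MMU path still needs it exclusive (the TDP MMU makes the shared path safe by updating SPTEs atomically). Below is a minimal pthread analogy of that locking choice, not kernel code; the function names are invented for illustration.

	/* build: cc -pthread rwlock_demo.c */
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_rwlock_t mmu_lock = PTHREAD_RWLOCK_INITIALIZER;

	static void handle_page_fault(bool tdp_mmu_root)
	{
		/* concurrent faults only need the lock shared on the TDP MMU path */
		if (tdp_mmu_root)
			pthread_rwlock_rdlock(&mmu_lock);
		else
			pthread_rwlock_wrlock(&mmu_lock);

		printf("installing mapping (%s lock)\n",
		       tdp_mmu_root ? "shared" : "exclusive");

		pthread_rwlock_unlock(&mmu_lock);
	}

	int main(void)
	{
		handle_page_fault(true);
		handle_page_fault(false);
		return 0;
	}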
@@ -3813,7 +3816,6 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
        context->gva_to_gpa = nonpaging_gva_to_gpa;
        context->sync_page = nonpaging_sync_page;
        context->invlpg = NULL;
-       context->update_pte = nonpaging_update_pte;
        context->root_level = 0;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
        context->direct_map = true;
@@ -3984,20 +3986,27 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
 static void
 __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
                        struct rsvd_bits_validate *rsvd_check,
-                       int maxphyaddr, int level, bool nx, bool gbpages,
+                       u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
                        bool pse, bool amd)
 {
-       u64 exb_bit_rsvd = 0;
        u64 gbpages_bit_rsvd = 0;
        u64 nonleaf_bit8_rsvd = 0;
+       u64 high_bits_rsvd;
 
        rsvd_check->bad_mt_xwr = 0;
 
-       if (!nx)
-               exb_bit_rsvd = rsvd_bits(63, 63);
        if (!gbpages)
                gbpages_bit_rsvd = rsvd_bits(7, 7);
 
+       if (level == PT32E_ROOT_LEVEL)
+               high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
+       else
+               high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
+
+       /* Note, NX doesn't exist in PDPTEs, this is handled below. */
+       if (!nx)
+               high_bits_rsvd |= rsvd_bits(63, 63);
+
        /*
         * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
         * leaf entries) on AMD CPUs only.
@@ -4026,45 +4035,39 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
                        rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
                break;
        case PT32E_ROOT_LEVEL:
-               rsvd_check->rsvd_bits_mask[0][2] =
-                       rsvd_bits(maxphyaddr, 63) |
-                       rsvd_bits(5, 8) | rsvd_bits(1, 2);      /* PDPTE */
-               rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
-                       rsvd_bits(maxphyaddr, 62);      /* PDE */
-               rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
-                       rsvd_bits(maxphyaddr, 62);      /* PTE */
-               rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
-                       rsvd_bits(maxphyaddr, 62) |
-                       rsvd_bits(13, 20);              /* large page */
+               rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
+                                                  high_bits_rsvd |
+                                                  rsvd_bits(5, 8) |
+                                                  rsvd_bits(1, 2);     /* PDPTE */
+               rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;      /* PDE */
+               rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;      /* PTE */
+               rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+                                                  rsvd_bits(13, 20);   /* large page */
                rsvd_check->rsvd_bits_mask[1][0] =
                        rsvd_check->rsvd_bits_mask[0][0];
                break;
        case PT64_ROOT_5LEVEL:
-               rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
-                       nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
-                       rsvd_bits(maxphyaddr, 51);
+               rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
+                                                  nonleaf_bit8_rsvd |
+                                                  rsvd_bits(7, 7);
                rsvd_check->rsvd_bits_mask[1][4] =
                        rsvd_check->rsvd_bits_mask[0][4];
                fallthrough;
        case PT64_ROOT_4LEVEL:
-               rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
-                       nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
-                       rsvd_bits(maxphyaddr, 51);
-               rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
-                       gbpages_bit_rsvd |
-                       rsvd_bits(maxphyaddr, 51);
-               rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
-                       rsvd_bits(maxphyaddr, 51);
-               rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
-                       rsvd_bits(maxphyaddr, 51);
+               rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
+                                                  nonleaf_bit8_rsvd |
+                                                  rsvd_bits(7, 7);
+               rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
+                                                  gbpages_bit_rsvd;
+               rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
+               rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
                rsvd_check->rsvd_bits_mask[1][3] =
                        rsvd_check->rsvd_bits_mask[0][3];
-               rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
-                       gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
-                       rsvd_bits(13, 29);
-               rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
-                       rsvd_bits(maxphyaddr, 51) |
-                       rsvd_bits(13, 20);              /* large page */
+               rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
+                                                  gbpages_bit_rsvd |
+                                                  rsvd_bits(13, 29);
+               rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+                                                  rsvd_bits(13, 20); /* large page */
                rsvd_check->rsvd_bits_mask[1][0] =
                        rsvd_check->rsvd_bits_mask[0][0];
                break;
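
The reserved-bits rework above replaces the maxphyaddr parameter with a precomputed mask of reserved physical-address bits (pa_bits_rsvd), from which high_bits_rsvd is derived per paging mode; reserved_hpa_bits() later supplies the host-side equivalent from shadow_phys_bits. A small stand-alone sketch of that arithmetic, using a local copy of rsvd_bits() and an assumed MAXPHYADDR of 46 purely for illustration:

/* Illustrative sketch of the pa_bits_rsvd/high_bits_rsvd arithmetic. */
#include <stdint.h>
#include <stdio.h>

/* Mirrors the kernel's rsvd_bits(s, e): a mask with bits s..e set. */
static uint64_t rsvd_bits(int s, int e)
{
	if (e < s)
		return 0;
	return ((2ULL << (e - s)) - 1) << s;
}

int main(void)
{
	int maxphyaddr = 46;	/* example value, e.g. from CPUID */
	int pt32e = 0;		/* set to 1 for PAE (PT32E) roots */

	/* What callers now precompute once, e.g. vcpu->arch.reserved_gpa_bits. */
	uint64_t pa_bits_rsvd = rsvd_bits(maxphyaddr, 63);

	/* What __reset_rsvds_bits_mask() derives per paging mode. */
	uint64_t high_bits_rsvd = pa_bits_rsvd &
		(pt32e ? rsvd_bits(0, 62) : rsvd_bits(0, 51));

	int nx = 0;		/* NX disabled: bit 63 becomes reserved */
	if (!nx)
		high_bits_rsvd |= rsvd_bits(63, 63);

	printf("pa_bits_rsvd   = %#018llx\n", (unsigned long long)pa_bits_rsvd);
	printf("high_bits_rsvd = %#018llx\n", (unsigned long long)high_bits_rsvd);
	return 0;
}
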
@@ -4075,8 +4078,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
                                  struct kvm_mmu *context)
 {
        __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
-                               cpuid_maxphyaddr(vcpu), context->root_level,
-                               context->nx,
+                               vcpu->arch.reserved_gpa_bits,
+                               context->root_level, context->nx,
                                guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
                                is_pse(vcpu),
                                guest_cpuid_is_amd_or_hygon(vcpu));
@@ -4084,27 +4087,22 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 
 static void
 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
-                           int maxphyaddr, bool execonly)
+                           u64 pa_bits_rsvd, bool execonly)
 {
+       u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
        u64 bad_mt_xwr;
 
-       rsvd_check->rsvd_bits_mask[0][4] =
-               rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
-       rsvd_check->rsvd_bits_mask[0][3] =
-               rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
-       rsvd_check->rsvd_bits_mask[0][2] =
-               rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
-       rsvd_check->rsvd_bits_mask[0][1] =
-               rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
-       rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
+       rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
+       rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
+       rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6);
+       rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6);
+       rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
 
        /* large page */
        rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
        rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
-       rsvd_check->rsvd_bits_mask[1][2] =
-               rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
-       rsvd_check->rsvd_bits_mask[1][1] =
-               rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
+       rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29);
+       rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20);
        rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
 
        bad_mt_xwr = 0xFFull << (2 * 8);        /* bits 3..5 must not be 2 */
@@ -4123,7 +4121,12 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
                struct kvm_mmu *context, bool execonly)
 {
        __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
-                                   cpuid_maxphyaddr(vcpu), execonly);
+                                   vcpu->arch.reserved_gpa_bits, execonly);
+}
+
+static inline u64 reserved_hpa_bits(void)
+{
+       return rsvd_bits(shadow_phys_bits, 63);
 }
 
 /*
@@ -4145,7 +4148,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
         */
        shadow_zero_check = &context->shadow_zero_check;
        __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
-                               shadow_phys_bits,
+                               reserved_hpa_bits(),
                                context->shadow_root_level, uses_nx,
                                guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
                                is_pse(vcpu), true);
@@ -4182,14 +4185,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
 
        if (boot_cpu_is_amd())
                __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
-                                       shadow_phys_bits,
+                                       reserved_hpa_bits(),
                                        context->shadow_root_level, false,
                                        boot_cpu_has(X86_FEATURE_GBPAGES),
                                        true, true);
        else
                __reset_rsvds_bits_mask_ept(shadow_zero_check,
-                                           shadow_phys_bits,
-                                           false);
+                                           reserved_hpa_bits(), false);
 
        if (!shadow_me_mask)
                return;
@@ -4209,7 +4211,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
                                struct kvm_mmu *context, bool execonly)
 {
        __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
-                                   shadow_phys_bits, execonly);
+                                   reserved_hpa_bits(), execonly);
 }
 
 #define BYTE_MASK(access) \
@@ -4395,7 +4397,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
        context->gva_to_gpa = paging64_gva_to_gpa;
        context->sync_page = paging64_sync_page;
        context->invlpg = paging64_invlpg;
-       context->update_pte = paging64_update_pte;
        context->shadow_root_level = level;
        context->direct_map = false;
 }
@@ -4424,7 +4425,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
        context->gva_to_gpa = paging32_gva_to_gpa;
        context->sync_page = paging32_sync_page;
        context->invlpg = paging32_invlpg;
-       context->update_pte = paging32_update_pte;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
        context->direct_map = false;
 }
@@ -4506,7 +4506,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
        context->page_fault = kvm_tdp_page_fault;
        context->sync_page = nonpaging_sync_page;
        context->invlpg = NULL;
-       context->update_pte = nonpaging_update_pte;
        context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
        context->direct_map = true;
        context->get_guest_pgd = get_cr3;
@@ -4678,7 +4677,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
        context->gva_to_gpa = ept_gva_to_gpa;
        context->sync_page = ept_sync_page;
        context->invlpg = ept_invlpg;
-       context->update_pte = ept_update_pte;
        context->root_level = level;
        context->direct_map = false;
        context->mmu_role.as_u64 = new_role.as_u64;
@@ -4811,7 +4809,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
        if (r)
                goto out;
        kvm_mmu_load_pgd(vcpu);
-       kvm_x86_ops.tlb_flush_current(vcpu);
+       static_call(kvm_x86_tlb_flush_current)(vcpu);
 out:
        return r;
 }
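
kvm_mmu_load() above now reaches the vendor callback through static_call(kvm_x86_tlb_flush_current) instead of indirecting through the kvm_x86_ops function-pointer table; the kernel patches the call site into a direct call once the backend is known. There is no user-space equivalent of that patching, so the sketch below only contrasts the two call shapes; the struct, the macro, and the vmx_flush_tlb_current() stub are illustrative assumptions.

/* Illustrative only: the kernel's static_call() patches the call site itself. */
#include <stdio.h>

struct x86_ops {
	void (*tlb_flush_current)(int vcpu_id);
};

static void vmx_flush_tlb_current(int vcpu_id)
{
	printf("flush TLB for vCPU %d\n", vcpu_id);
}

/* Old style: every call goes through a writable function pointer. */
static struct x86_ops kvm_x86_ops = {
	.tlb_flush_current = vmx_flush_tlb_current,
};

/* New style (approximation): the call site names its target directly. */
#define static_call_demo(func) func

int main(void)
{
	kvm_x86_ops.tlb_flush_current(0);	    /* indirect call */
	static_call_demo(vmx_flush_tlb_current)(0); /* direct call   */
	return 0;
}
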
@@ -4826,19 +4824,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
-static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
-                                 struct kvm_mmu_page *sp, u64 *spte,
-                                 const void *new)
-{
-       if (sp->role.level != PG_LEVEL_4K) {
-               ++vcpu->kvm->stat.mmu_pde_zapped;
-               return;
-        }
-
-       ++vcpu->kvm->stat.mmu_pte_updated;
-       vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
-}
-
 static bool need_remote_flush(u64 old, u64 new)
 {
        if (!is_shadow_present_pte(old))
@@ -4954,22 +4939,6 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
        return spte;
 }
 
-/*
- * Ignore various flags when determining if a SPTE can be immediately
- * overwritten for the current MMU.
- *  - level: explicitly checked in mmu_pte_write_new_pte(), and will never
- *    match the current MMU role, as MMU's level tracks the root level.
- *  - access: updated based on the new guest PTE
- *  - quadrant: handled by get_written_sptes()
- *  - invalid: always false (loop only walks valid shadow pages)
- */
-static const union kvm_mmu_page_role role_ign = {
-       .level = 0xf,
-       .access = 0x7,
-       .quadrant = 0x3,
-       .invalid = 0x1,
-};
-
 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                              const u8 *new, int bytes,
                              struct kvm_page_track_notifier_node *node)
@@ -4999,7 +4968,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
         */
        mmu_topup_memory_caches(vcpu, true);
 
-       spin_lock(&vcpu->kvm->mmu_lock);
+       write_lock(&vcpu->kvm->mmu_lock);
 
        gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
 
@@ -5020,14 +4989,10 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
                local_flush = true;
                while (npte--) {
-                       u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
-
                        entry = *spte;
                        mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
-                       if (gentry &&
-                           !((sp->role.word ^ base_role) & ~role_ign.word) &&
-                           rmap_can_add(vcpu))
-                               mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
+                       if (gentry && sp->role.level != PG_LEVEL_4K)
+                               ++vcpu->kvm->stat.mmu_pde_zapped;
                        if (need_remote_flush(entry, *spte))
                                remote_flush = true;
                        ++spte;
@@ -5035,7 +5000,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        }
        kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
        kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
-       spin_unlock(&vcpu->kvm->mmu_lock);
+       write_unlock(&vcpu->kvm->mmu_lock);
 }
 
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@@ -5125,7 +5090,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                if (is_noncanonical_address(gva, vcpu))
                        return;
 
-               kvm_x86_ops.tlb_flush_gva(vcpu, gva);
+               static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
        }
 
        if (!mmu->invlpg)
@@ -5182,7 +5147,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
        }
 
        if (tlb_flush)
-               kvm_x86_ops.tlb_flush_gva(vcpu, gva);
+               static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
 
        ++vcpu->stat.invlpg;
 
@@ -5233,14 +5198,14 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
                if (iterator.rmap)
                        flush |= fn(kvm, iterator.rmap);
 
-               if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+               if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
                        if (flush && lock_flush_tlb) {
                                kvm_flush_remote_tlbs_with_address(kvm,
                                                start_gfn,
                                                iterator.gfn - start_gfn + 1);
                                flush = false;
                        }
-                       cond_resched_lock(&kvm->mmu_lock);
+                       cond_resched_rwlock_write(&kvm->mmu_lock);
                }
        }
 
@@ -5390,7 +5355,7 @@ restart:
                 * be in active use by the guest.
                 */
                if (batch >= BATCH_ZAP_PAGES &&
-                   cond_resched_lock(&kvm->mmu_lock)) {
+                   cond_resched_rwlock_write(&kvm->mmu_lock)) {
                        batch = 0;
                        goto restart;
                }
@@ -5423,7 +5388,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 {
        lockdep_assert_held(&kvm->slots_lock);
 
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
        trace_kvm_mmu_zap_all_fast(kvm);
 
        /*
@@ -5447,10 +5412,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 
        kvm_zap_obsolete_pages(kvm);
 
-       if (kvm->arch.tdp_mmu_enabled)
+       if (is_tdp_mmu_enabled(kvm))
                kvm_tdp_mmu_zap_all(kvm);
 
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
 }
 
 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
@@ -5492,7 +5457,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
        int i;
        bool flush;
 
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                slots = __kvm_memslots(kvm, i);
                kvm_for_each_memslot(memslot, slots) {
@@ -5510,13 +5475,13 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
                }
        }
 
-       if (kvm->arch.tdp_mmu_enabled) {
+       if (is_tdp_mmu_enabled(kvm)) {
                flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
                if (flush)
                        kvm_flush_remote_tlbs(kvm);
        }
 
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
 }
 
 static bool slot_rmap_write_protect(struct kvm *kvm,
@@ -5531,12 +5496,12 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 {
        bool flush;
 
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
        flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
                                start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
-       if (kvm->arch.tdp_mmu_enabled)
+       if (is_tdp_mmu_enabled(kvm))
                flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
 
        /*
         * We can flush all the TLBs out of the mmu lock without TLB
@@ -5596,13 +5561,13 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                   const struct kvm_memory_slot *memslot)
 {
        /* FIXME: const-ify all uses of struct kvm_memory_slot.  */
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
        slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
                         kvm_mmu_zap_collapsible_spte, true);
 
-       if (kvm->arch.tdp_mmu_enabled)
+       if (is_tdp_mmu_enabled(kvm))
                kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot);
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
 }
 
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
@@ -5625,11 +5590,11 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 {
        bool flush;
 
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
        flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
-       if (kvm->arch.tdp_mmu_enabled)
+       if (is_tdp_mmu_enabled(kvm))
                flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
 
        /*
         * It's also safe to flush TLBs out of mmu lock here as currently this
@@ -5647,12 +5612,12 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
 {
        bool flush;
 
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
        flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
                                        false);
-       if (kvm->arch.tdp_mmu_enabled)
+       if (is_tdp_mmu_enabled(kvm))
                flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M);
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
 
        if (flush)
                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5664,11 +5629,11 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 {
        bool flush;
 
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
        flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
-       if (kvm->arch.tdp_mmu_enabled)
+       if (is_tdp_mmu_enabled(kvm))
                flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot);
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
 
        if (flush)
                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5681,23 +5646,23 @@ void kvm_mmu_zap_all(struct kvm *kvm)
        LIST_HEAD(invalid_list);
        int ign;
 
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
 restart:
        list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
                if (WARN_ON(sp->role.invalid))
                        continue;
                if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
                        goto restart;
-               if (cond_resched_lock(&kvm->mmu_lock))
+               if (cond_resched_rwlock_write(&kvm->mmu_lock))
                        goto restart;
        }
 
        kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
-       if (kvm->arch.tdp_mmu_enabled)
+       if (is_tdp_mmu_enabled(kvm))
                kvm_tdp_mmu_zap_all(kvm);
 
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
 }
 
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
@@ -5757,7 +5722,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
                        continue;
 
                idx = srcu_read_lock(&kvm->srcu);
-               spin_lock(&kvm->mmu_lock);
+               write_lock(&kvm->mmu_lock);
 
                if (kvm_has_zapped_obsolete_pages(kvm)) {
                        kvm_mmu_commit_zap_page(kvm,
@@ -5768,7 +5733,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
                freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
 
 unlock:
-               spin_unlock(&kvm->mmu_lock);
+               write_unlock(&kvm->mmu_lock);
                srcu_read_unlock(&kvm->srcu, idx);
 
                /*
@@ -5988,7 +5953,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
        ulong to_zap;
 
        rcu_idx = srcu_read_lock(&kvm->srcu);
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
 
        ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
        to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
@@ -6005,22 +5970,22 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
                                      struct kvm_mmu_page,
                                      lpage_disallowed_link);
                WARN_ON_ONCE(!sp->lpage_disallowed);
-               if (sp->tdp_mmu_page)
+               if (is_tdp_mmu_page(sp)) {
                        kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn,
                                sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level));
-               else {
+               } else {
                        kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
                        WARN_ON_ONCE(sp->lpage_disallowed);
                }
 
-               if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+               if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
                        kvm_mmu_commit_zap_page(kvm, &invalid_list);
-                       cond_resched_lock(&kvm->mmu_lock);
+                       cond_resched_rwlock_write(&kvm->mmu_lock);
                }
        }
        kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
        srcu_read_unlock(&kvm->srcu, rcu_idx);
 }
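
kvm_mmu_zap_all() and kvm_recover_nx_lpages() above now test rwlock_needbreak() and call cond_resched_rwlock_write(), so a long zap commits its batched work and briefly drops the write lock when readers are waiting. A minimal pthread model of that batch-commit-yield loop; writer_should_yield() is an invented stand-in for need_resched()/rwlock_needbreak():

/* Illustrative sketch of cond_resched_rwlock_write()-style batching. */
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t mmu_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Stand-in for need_resched() || rwlock_needbreak(&kvm->mmu_lock). */
static bool writer_should_yield(int done)
{
	return (done % 64) == 0;	/* pretend contention every 64 items */
}

static void zap_all(int nr_pages)
{
	int pending = 0;

	pthread_rwlock_wrlock(&mmu_lock);
	for (int i = 1; i <= nr_pages; i++) {
		pending++;		/* "prepare" a zap under the lock */

		if (writer_should_yield(i)) {
			/* Commit (flush) what was batched, then let readers in. */
			printf("committing %d zaps, yielding\n", pending);
			pending = 0;
			pthread_rwlock_unlock(&mmu_lock);
			sched_yield();
			pthread_rwlock_wrlock(&mmu_lock);
		}
	}
	printf("committing final %d zaps\n", pending);
	pthread_rwlock_unlock(&mmu_lock);
}

int main(void)
{
	zap_all(200);
	return 0;
}
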
 
index c8d51a3..ced15fd 100644 (file)
@@ -234,7 +234,7 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
 }
 
 static bool mmu_audit;
-static struct static_key mmu_audit_key;
+static DEFINE_STATIC_KEY_FALSE(mmu_audit_key);
 
 static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
 {
@@ -250,7 +250,7 @@ static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
 
 static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
 {
-       if (static_key_false((&mmu_audit_key)))
+       if (static_branch_unlikely((&mmu_audit_key)))
                __kvm_mmu_audit(vcpu, point);
 }
 
@@ -259,7 +259,7 @@ static void mmu_audit_enable(void)
        if (mmu_audit)
                return;
 
-       static_key_slow_inc(&mmu_audit_key);
+       static_branch_inc(&mmu_audit_key);
        mmu_audit = true;
 }
 
@@ -268,7 +268,7 @@ static void mmu_audit_disable(void)
        if (!mmu_audit)
                return;
 
-       static_key_slow_dec(&mmu_audit_key);
+       static_branch_dec(&mmu_audit_key);
        mmu_audit = false;
 }
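
mmu_audit.c above moves from the legacy struct static_key API to DEFINE_STATIC_KEY_FALSE()/static_branch_unlikely()/static_branch_inc(), the jump-label interface that patches the branch out of the hot path while auditing is disabled. No user-space equivalent exists, so this sketch only models the semantics with a plain flag and a branch-prediction hint:

/* Semantics-only model of a static key; the real mechanism patches code. */
#include <stdbool.h>
#include <stdio.h>

#define static_branch_unlikely_demo(key) __builtin_expect(!!(key), 0)

static bool mmu_audit_key;	/* models DEFINE_STATIC_KEY_FALSE(mmu_audit_key) */

static void __kvm_mmu_audit(int point)
{
	printf("auditing at point %d\n", point);
}

static inline void kvm_mmu_audit(int point)
{
	/* Off by default: the common path should fall straight through. */
	if (static_branch_unlikely_demo(mmu_audit_key))
		__kvm_mmu_audit(point);
}

int main(void)
{
	kvm_mmu_audit(0);	/* no output: key is false */
	mmu_audit_key = true;	/* models static_branch_inc() */
	kvm_mmu_audit(1);	/* prints */
	return 0;
}
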
 
index bfc6389..9e38d3c 100644 (file)
@@ -12,7 +12,7 @@
 extern bool dbg;
 
 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
-#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(fmt, args...) do { if (dbg) printk("%s: " fmt, __func__, ## args); } while (0)
 #define MMU_WARN_ON(x) WARN_ON(x)
 #else
 #define pgprintk(x...) do { } while (0)
@@ -56,7 +56,12 @@ struct kvm_mmu_page {
        /* Number of writes since the last time traversal visited this page.  */
        atomic_t write_flooding_count;
 
+#ifdef CONFIG_X86_64
        bool tdp_mmu_page;
+
+       /* Used for freeing the page asynchronously if it is a TDP MMU page. */
+       struct rcu_head rcu_head;
+#endif
 };
 
 extern struct kmem_cache *mmu_page_header_cache;
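
The new rmap_printk() above prepends the calling function's name via __func__, so callers stop hard-coding it in their format strings. A stand-alone printf-based version showing the effect; the gfn value and the caller name are arbitrary examples:

/* User-space version of the new rmap_printk(): prefixes __func__ automatically. */
#include <stdio.h>

#define rmap_printk(fmt, ...) \
	printf("%s: " fmt, __func__, ##__VA_ARGS__)

static void rmap_remove(unsigned long long gfn)
{
	/* Old callers wrote "rmap_remove:" by hand in every format string. */
	rmap_printk("gfn %llx\n", gfn);
}

int main(void)
{
	rmap_remove(0x1234);	/* prints "rmap_remove: gfn 1234" */
	return 0;
}
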
index 8443a67..34bb0ec 100644 (file)
@@ -184,9 +184,9 @@ kvm_page_track_register_notifier(struct kvm *kvm,
 
        head = &kvm->arch.track_notifier_head;
 
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
        hlist_add_head_rcu(&n->node, &head->track_notifier_list);
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
 
@@ -202,9 +202,9 @@ kvm_page_track_unregister_notifier(struct kvm *kvm,
 
        head = &kvm->arch.track_notifier_head;
 
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
        hlist_del_rcu(&n->node);
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
        synchronize_srcu(&head->track_srcu);
 }
 EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
index 50e268e..d9f66cc 100644 (file)
@@ -868,7 +868,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
        }
 
        r = RET_PF_RETRY;
-       spin_lock(&vcpu->kvm->mmu_lock);
+       write_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
                goto out_unlock;
 
@@ -881,7 +881,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
        kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 
 out_unlock:
-       spin_unlock(&vcpu->kvm->mmu_lock);
+       write_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
        return r;
 }
@@ -919,7 +919,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
                return;
        }
 
-       spin_lock(&vcpu->kvm->mmu_lock);
+       write_lock(&vcpu->kvm->mmu_lock);
        for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
                level = iterator.level;
                sptep = iterator.sptep;
@@ -954,7 +954,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
                if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
                        break;
        }
-       spin_unlock(&vcpu->kvm->mmu_lock);
+       write_unlock(&vcpu->kvm->mmu_lock);
 }
 
 /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
index c51ad54..ef55f0b 100644 (file)
@@ -120,7 +120,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
        if (level > PG_LEVEL_4K)
                spte |= PT_PAGE_SIZE_MASK;
        if (tdp_enabled)
-               spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
+               spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
                        kvm_is_mmio_pfn(pfn));
 
        if (host_writable)
index 2b3a30b..6de3950 100644 (file)
@@ -130,6 +130,25 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
                                          PT64_EPT_EXECUTABLE_MASK)
 #define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT
 
+/*
+ * If a thread running without exclusive control of the MMU lock must perform a
+ * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
+ * non-present intermediate value. Other threads which encounter this value
+ * should not modify the SPTE.
+ *
+ * This constant works because it is considered non-present on both AMD and
+ * Intel CPUs and does not create an L1TF vulnerability because the pfn section
+ * is zeroed out.
+ *
+ * Only used by the TDP MMU.
+ */
+#define REMOVED_SPTE (1ull << 59)
+
+static inline bool is_removed_spte(u64 spte)
+{
+       return spte == REMOVED_SPTE;
+}
+
 /*
  * In some cases, we need to preserve the GFN of a non-present or reserved
  * SPTE when we usurp the upper five bits of the physical address space to
@@ -185,23 +204,19 @@ static inline bool is_access_track_spte(u64 spte)
        return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
 }
 
-static inline int is_shadow_present_pte(u64 pte)
+static inline bool is_shadow_present_pte(u64 pte)
 {
-       return (pte != 0) && !is_mmio_spte(pte);
+       return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte);
 }
 
-static inline int is_large_pte(u64 pte)
+static inline bool is_large_pte(u64 pte)
 {
        return pte & PT_PAGE_SIZE_MASK;
 }
 
-static inline int is_last_spte(u64 pte, int level)
+static inline bool is_last_spte(u64 pte, int level)
 {
-       if (level == PG_LEVEL_4K)
-               return 1;
-       if (is_large_pte(pte))
-               return 1;
-       return 0;
+       return (level == PG_LEVEL_4K) || is_large_pte(pte);
 }
 
 static inline bool is_executable_pte(u64 spte)
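
The REMOVED_SPTE block above defines the "frozen" non-present sentinel used by threads that hold mmu_lock only for read: an entry is swapped to the sentinel before a multi-step update, and any other thread that sees it backs off until the owner installs the final value. The sketch below models that hand-off with C11 atomics; REMOVED_ENTRY and the 64-bit values are illustrative and not the kernel's SPTE encoding:

/* Sketch of the "frozen entry" protocol using a sentinel value. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define REMOVED_ENTRY	(1ULL << 59)	/* non-present sentinel, like REMOVED_SPTE */

static _Atomic uint64_t entry;

/* Try to replace a known old value; fail if someone else changed it first. */
static bool set_entry_atomic(uint64_t old_val, uint64_t new_val)
{
	/* A frozen entry belongs to the thread that froze it: do not touch. */
	if (old_val == REMOVED_ENTRY)
		return false;

	return atomic_compare_exchange_strong(&entry, &old_val, new_val);
}

int main(void)
{
	atomic_store(&entry, 0x1000ULL);

	uint64_t old_val = atomic_load(&entry);

	/* Freeze the entry before a multi-step teardown (e.g. a TLB flush). */
	if (set_entry_atomic(old_val, REMOVED_ENTRY))
		printf("entry frozen by this thread\n");

	/* A concurrent updater observing the sentinel must back off and retry. */
	if (!set_entry_atomic(atomic_load(&entry), 0x2000ULL))
		printf("concurrent update backed off\n");

	/* Only the freezing thread installs the final value. */
	atomic_store(&entry, 0);
	printf("entry cleared after flush\n");
	return 0;
}
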
index 87b7e16..e5f1481 100644 (file)
@@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
 {
        iter->sptep = iter->pt_path[iter->level - 1] +
                SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
-       iter->old_spte = READ_ONCE(*iter->sptep);
+       iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
 }
 
 static gfn_t round_gfn_for_level(gfn_t gfn, int level)
@@ -22,21 +22,22 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
 
 /*
  * Sets a TDP iterator to walk a pre-order traversal of the paging structure
- * rooted at root_pt, starting with the walk to translate goal_gfn.
+ * rooted at root_pt, starting with the walk to translate next_last_level_gfn.
  */
 void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
-                   int min_level, gfn_t goal_gfn)
+                   int min_level, gfn_t next_last_level_gfn)
 {
        WARN_ON(root_level < 1);
        WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
 
-       iter->goal_gfn = goal_gfn;
+       iter->next_last_level_gfn = next_last_level_gfn;
+       iter->yielded_gfn = iter->next_last_level_gfn;
        iter->root_level = root_level;
        iter->min_level = min_level;
        iter->level = root_level;
-       iter->pt_path[iter->level - 1] = root_pt;
+       iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt;
 
-       iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
+       iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
        tdp_iter_refresh_sptep(iter);
 
        iter->valid = true;
@@ -47,7 +48,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
  * address of the child page table referenced by the SPTE. Returns null if
  * there is no such entry.
  */
-u64 *spte_to_child_pt(u64 spte, int level)
+tdp_ptep_t spte_to_child_pt(u64 spte, int level)
 {
        /*
         * There's no child entry if this entry isn't present or is a
@@ -56,7 +57,7 @@ u64 *spte_to_child_pt(u64 spte, int level)
        if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
                return NULL;
 
-       return __va(spte_to_pfn(spte) << PAGE_SHIFT);
+       return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT);
 }
 
 /*
@@ -65,7 +66,7 @@ u64 *spte_to_child_pt(u64 spte, int level)
  */
 static bool try_step_down(struct tdp_iter *iter)
 {
-       u64 *child_pt;
+       tdp_ptep_t child_pt;
 
        if (iter->level == iter->min_level)
                return false;
@@ -74,7 +75,7 @@ static bool try_step_down(struct tdp_iter *iter)
         * Reread the SPTE before stepping down to avoid traversing into page
         * tables that are no longer linked from this entry.
         */
-       iter->old_spte = READ_ONCE(*iter->sptep);
+       iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
 
        child_pt = spte_to_child_pt(iter->old_spte, iter->level);
        if (!child_pt)
@@ -82,7 +83,7 @@ static bool try_step_down(struct tdp_iter *iter)
 
        iter->level--;
        iter->pt_path[iter->level - 1] = child_pt;
-       iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
+       iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
        tdp_iter_refresh_sptep(iter);
 
        return true;
@@ -106,9 +107,9 @@ static bool try_step_side(struct tdp_iter *iter)
                return false;
 
        iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
-       iter->goal_gfn = iter->gfn;
+       iter->next_last_level_gfn = iter->gfn;
        iter->sptep++;
-       iter->old_spte = READ_ONCE(*iter->sptep);
+       iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
 
        return true;
 }
@@ -158,24 +159,7 @@ void tdp_iter_next(struct tdp_iter *iter)
        iter->valid = false;
 }
 
-/*
- * Restart the walk over the paging structure from the root, starting from the
- * highest gfn the iterator had previously reached. Assumes that the entire
- * paging structure, except the root page, may have been completely torn down
- * and rebuilt.
- */
-void tdp_iter_refresh_walk(struct tdp_iter *iter)
-{
-       gfn_t goal_gfn = iter->goal_gfn;
-
-       if (iter->gfn > goal_gfn)
-               goal_gfn = iter->gfn;
-
-       tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
-                      iter->root_level, iter->min_level, goal_gfn);
-}
-
-u64 *tdp_iter_root_pt(struct tdp_iter *iter)
+tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter)
 {
        return iter->pt_path[iter->root_level - 1];
 }
index 47170d0..4cc177d 100644 (file)
@@ -7,6 +7,8 @@
 
 #include "mmu.h"
 
+typedef u64 __rcu *tdp_ptep_t;
+
 /*
  * A TDP iterator performs a pre-order walk over a TDP paging structure.
  */
@@ -15,11 +17,17 @@ struct tdp_iter {
         * The iterator will traverse the paging structure towards the mapping
         * for this GFN.
         */
-       gfn_t goal_gfn;
+       gfn_t next_last_level_gfn;
+       /*
+        * The next_last_level_gfn at the time when the thread last
+        * yielded. Only yielding when the next_last_level_gfn !=
+        * yielded_gfn helps ensure forward progress.
+        */
+       gfn_t yielded_gfn;
        /* Pointers to the page tables traversed to reach the current SPTE */
-       u64 *pt_path[PT64_ROOT_MAX_LEVEL];
+       tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
        /* A pointer to the current SPTE */
-       u64 *sptep;
+       tdp_ptep_t sptep;
        /* The lowest GFN mapped by the current SPTE */
        gfn_t gfn;
        /* The level of the root page given to the iterator */
@@ -49,12 +57,11 @@ struct tdp_iter {
 #define for_each_tdp_pte(iter, root, root_level, start, end) \
        for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end)
 
-u64 *spte_to_child_pt(u64 pte, int level);
+tdp_ptep_t spte_to_child_pt(u64 pte, int level);
 
 void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
-                   int min_level, gfn_t goal_gfn);
+                   int min_level, gfn_t next_last_level_gfn);
 void tdp_iter_next(struct tdp_iter *iter);
-void tdp_iter_refresh_walk(struct tdp_iter *iter);
-u64 *tdp_iter_root_pt(struct tdp_iter *iter);
+tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter);
 
 #endif /* __KVM_X86_MMU_TDP_ITER_H */
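
The new yielded_gfn field above lets tdp_mmu_iter_cond_resched() refuse to yield again until the walk has advanced past the point of the previous yield, which guarantees forward progress even under constant lock contention. A small stand-alone model of that guard; lock_contended() is an invented predicate standing in for need_resched()/rwlock_needbreak():

/* Model of the yielded_gfn forward-progress guard in the TDP iterator. */
#include <stdbool.h>
#include <stdio.h>

struct walk {
	unsigned long next_gfn;		/* where the walk will resume        */
	unsigned long yielded_gfn;	/* next_gfn value at the last yield  */
};

static bool lock_contended(void)
{
	return true;			/* worst case: always asked to yield */
}

/* Returns true if we yielded and the caller should restart from next_gfn. */
static bool cond_yield(struct walk *w)
{
	/* Refuse to yield until at least one entry was handled since last time. */
	if (w->next_gfn == w->yielded_gfn)
		return false;

	if (lock_contended()) {
		w->yielded_gfn = w->next_gfn;
		printf("yielding, will restart at gfn %lu\n", w->next_gfn);
		return true;
	}
	return false;
}

int main(void)
{
	struct walk w = { .next_gfn = 0, .yielded_gfn = 0 };

	for (unsigned long gfn = 0; gfn < 4; ) {
		if (cond_yield(&w))
			continue;	/* restart the (modelled) walk */
		printf("processing gfn %lu\n", gfn);
		w.next_gfn = ++gfn;	/* progress made: yielding allowed again */
	}
	return 0;
}
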
index 2ef8615..71e100a 100644 (file)
@@ -7,32 +7,23 @@
 #include "tdp_mmu.h"
 #include "spte.h"
 
+#include <asm/cmpxchg.h>
 #include <trace/events/kvm.h>
 
-#ifdef CONFIG_X86_64
 static bool __read_mostly tdp_mmu_enabled = false;
 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
-#endif
-
-static bool is_tdp_mmu_enabled(void)
-{
-#ifdef CONFIG_X86_64
-       return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
-#else
-       return false;
-#endif /* CONFIG_X86_64 */
-}
 
 /* Initializes the TDP MMU for the VM, if enabled. */
 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
 {
-       if (!is_tdp_mmu_enabled())
+       if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
                return;
 
        /* This should not be changed for the lifetime of the VM. */
        kvm->arch.tdp_mmu_enabled = true;
 
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
+       spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
 }
 
@@ -42,6 +33,12 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
                return;
 
        WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
+
+       /*
+        * Ensure that all the outstanding RCU callbacks to free shadow pages
+        * can run before the VM is torn down.
+        */
+       rcu_barrier();
 }
 
 static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
@@ -53,7 +50,7 @@ static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
 static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
                                           struct kvm_mmu_page *root)
 {
-       lockdep_assert_held(&kvm->mmu_lock);
+       lockdep_assert_held_write(&kvm->mmu_lock);
 
        if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
                return false;
@@ -88,22 +85,6 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
 #define for_each_tdp_mmu_root(_kvm, _root)                             \
        list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
 
-bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
-{
-       struct kvm_mmu_page *sp;
-
-       if (!kvm->arch.tdp_mmu_enabled)
-               return false;
-       if (WARN_ON(!VALID_PAGE(hpa)))
-               return false;
-
-       sp = to_shadow_page(hpa);
-       if (WARN_ON(!sp))
-               return false;
-
-       return sp->tdp_mmu_page && sp->root_count;
-}
-
 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                          gfn_t start, gfn_t end, bool can_yield);
 
@@ -111,7 +92,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
 {
        gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
 
-       lockdep_assert_held(&kvm->mmu_lock);
+       lockdep_assert_held_write(&kvm->mmu_lock);
 
        WARN_ON(root->root_count);
        WARN_ON(!root->tdp_mmu_page);
@@ -164,13 +145,13 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
 
        role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
 
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
 
        /* Check for an existing root before allocating a new one. */
        for_each_tdp_mmu_root(kvm, root) {
                if (root->role.word == role.word) {
                        kvm_mmu_get_root(kvm, root);
-                       spin_unlock(&kvm->mmu_lock);
+                       write_unlock(&kvm->mmu_lock);
                        return root;
                }
        }
@@ -180,7 +161,7 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
 
        list_add(&root->link, &kvm->arch.tdp_mmu_roots);
 
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
 
        return root;
 }
@@ -196,8 +177,31 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
        return __pa(root->spt);
 }
 
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
+{
+       free_page((unsigned long)sp->spt);
+       kmem_cache_free(mmu_page_header_cache, sp);
+}
+
+/*
+ * This is called through call_rcu in order to free TDP page table memory
+ * safely with respect to other kernel threads that may be operating on
+ * the memory.
+ * By only accessing TDP MMU page table memory in an RCU read critical
+ * section, and freeing it after a grace period, lockless access to that
+ * memory won't use it after it is freed.
+ */
+static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
+{
+       struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
+                                              rcu_head);
+
+       tdp_mmu_free_sp(sp);
+}
+
 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
-                               u64 old_spte, u64 new_spte, int level);
+                               u64 old_spte, u64 new_spte, int level,
+                               bool shared);
 
 static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
 {
@@ -234,6 +238,128 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
        }
 }
 
+/**
+ * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
+ *
+ * @kvm: kvm instance
+ * @sp: the new page
+ * @shared: This operation may not be running under the exclusive use of
+ *         the MMU lock and the operation must synchronize with other
+ *         threads that might be adding or removing pages.
+ * @account_nx: This page replaces a NX large page and should be marked for
+ *             eventual reclaim.
+ */
+static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+                             bool shared, bool account_nx)
+{
+       if (shared)
+               spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+       else
+               lockdep_assert_held_write(&kvm->mmu_lock);
+
+       list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
+       if (account_nx)
+               account_huge_nx_page(kvm, sp);
+
+       if (shared)
+               spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+}
+
+/**
+ * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
+ *
+ * @kvm: kvm instance
+ * @sp: the page to be removed
+ * @shared: This operation may not be running under the exclusive use of
+ *         the MMU lock and the operation must synchronize with other
+ *         threads that might be adding or removing pages.
+ */
+static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+                               bool shared)
+{
+       if (shared)
+               spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+       else
+               lockdep_assert_held_write(&kvm->mmu_lock);
+
+       list_del(&sp->link);
+       if (sp->lpage_disallowed)
+               unaccount_huge_nx_page(kvm, sp);
+
+       if (shared)
+               spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+}
+
+/**
+ * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
+ *
+ * @kvm: kvm instance
+ * @pt: the page removed from the paging structure
+ * @shared: This operation may not be running under the exclusive use
+ *         of the MMU lock and the operation must synchronize with other
+ *         threads that might be modifying SPTEs.
+ *
+ * Given a page table that has been removed from the TDP paging structure,
+ * iterates through the page table to clear SPTEs and free child page tables.
+ */
+static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
+                                       bool shared)
+{
+       struct kvm_mmu_page *sp = sptep_to_sp(pt);
+       int level = sp->role.level;
+       gfn_t base_gfn = sp->gfn;
+       u64 old_child_spte;
+       u64 *sptep;
+       gfn_t gfn;
+       int i;
+
+       trace_kvm_mmu_prepare_zap_page(sp);
+
+       tdp_mmu_unlink_page(kvm, sp, shared);
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+               sptep = pt + i;
+               gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
+
+               if (shared) {
+                       /*
+                        * Set the SPTE to a nonpresent value that other
+                        * threads will not overwrite. If the SPTE was
+                        * already marked as removed then another thread
+                        * handling a page fault could overwrite it, so
+                        * keep retrying the exchange until the value it
+                        * replaces is something other than the removed
+                        * SPTE value.
+                        */
+                       for (;;) {
+                               old_child_spte = xchg(sptep, REMOVED_SPTE);
+                               if (!is_removed_spte(old_child_spte))
+                                       break;
+                               cpu_relax();
+                       }
+               } else {
+                       old_child_spte = READ_ONCE(*sptep);
+
+                       /*
+                        * Marking the SPTE as a removed SPTE is not
+                        * strictly necessary here as the MMU lock will
+                        * stop other threads from concurrently modifying
+                        * this SPTE. Using the removed SPTE value keeps
+                        * the two branches consistent and simplifies
+                        * the function.
+                        */
+                       WRITE_ONCE(*sptep, REMOVED_SPTE);
+               }
+               handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
+                                   old_child_spte, REMOVED_SPTE, level - 1,
+                                   shared);
+       }
+
+       kvm_flush_remote_tlbs_with_address(kvm, gfn,
+                                          KVM_PAGES_PER_HPAGE(level));
+
+       call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
+}
+
 /**
  * handle_changed_spte - handle bookkeeping associated with an SPTE change
  * @kvm: kvm instance
@@ -242,22 +368,22 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
  * @old_spte: The value of the SPTE before the change
  * @new_spte: The value of the SPTE after the change
  * @level: the level of the PT the SPTE is part of in the paging structure
+ * @shared: This operation may not be running under the exclusive use of
+ *         the MMU lock and the operation must synchronize with other
+ *         threads that might be modifying SPTEs.
  *
  * Handle bookkeeping that might result from the modification of a SPTE.
  * This function must be called for all TDP SPTE modifications.
  */
 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
-                               u64 old_spte, u64 new_spte, int level)
+                                 u64 old_spte, u64 new_spte, int level,
+                                 bool shared)
 {
        bool was_present = is_shadow_present_pte(old_spte);
        bool is_present = is_shadow_present_pte(new_spte);
        bool was_leaf = was_present && is_last_spte(old_spte, level);
        bool is_leaf = is_present && is_last_spte(new_spte, level);
        bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
-       u64 *pt;
-       struct kvm_mmu_page *sp;
-       u64 old_child_spte;
-       int i;
 
        WARN_ON(level > PT64_ROOT_MAX_LEVEL);
        WARN_ON(level < PG_LEVEL_4K);
@@ -298,15 +424,19 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
         */
        if (!was_present && !is_present) {
                /*
-                * If this change does not involve a MMIO SPTE, it is
-                * unexpected. Log the change, though it should not impact the
-                * guest since both the former and current SPTEs are nonpresent.
+                * If this change does not involve a MMIO SPTE or removed SPTE,
+                * it is unexpected. Log the change, though it should not
+                * impact the guest since both the former and current SPTEs
+                * are nonpresent.
                 */
-               if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
+               if (WARN_ON(!is_mmio_spte(old_spte) &&
+                           !is_mmio_spte(new_spte) &&
+                           !is_removed_spte(new_spte)))
                        pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
                               "should not be replaced with another,\n"
                               "different nonpresent SPTE, unless one or both\n"
-                              "are MMIO SPTEs.\n"
+                              "are MMIO SPTEs, or the new SPTE is\n"
+                              "a temporary removed SPTE.\n"
                               "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                               as_id, gfn, old_spte, new_spte, level);
                return;
@@ -321,54 +451,127 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
         * Recursively handle child PTs if the change removed a subtree from
         * the paging structure.
         */
-       if (was_present && !was_leaf && (pfn_changed || !is_present)) {
-               pt = spte_to_child_pt(old_spte, level);
-               sp = sptep_to_sp(pt);
+       if (was_present && !was_leaf && (pfn_changed || !is_present))
+               handle_removed_tdp_mmu_page(kvm,
+                               spte_to_child_pt(old_spte, level), shared);
+}
 
-               trace_kvm_mmu_prepare_zap_page(sp);
+static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
+                               u64 old_spte, u64 new_spte, int level,
+                               bool shared)
+{
+       __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
+                             shared);
+       handle_changed_spte_acc_track(old_spte, new_spte, level);
+       handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
+                                     new_spte, level);
+}
 
-               list_del(&sp->link);
+/*
+ * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
+ * associated bookkeeping
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ * Returns: true if the SPTE was set, false if it was not. If false is returned,
+ *         this function will have no side-effects.
+ */
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+                                          struct tdp_iter *iter,
+                                          u64 new_spte)
+{
+       u64 *root_pt = tdp_iter_root_pt(iter);
+       struct kvm_mmu_page *root = sptep_to_sp(root_pt);
+       int as_id = kvm_mmu_page_as_id(root);
 
-               if (sp->lpage_disallowed)
-                       unaccount_huge_nx_page(kvm, sp);
+       lockdep_assert_held_read(&kvm->mmu_lock);
 
-               for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
-                       old_child_spte = READ_ONCE(*(pt + i));
-                       WRITE_ONCE(*(pt + i), 0);
-                       handle_changed_spte(kvm, as_id,
-                               gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
-                               old_child_spte, 0, level - 1);
-               }
+       /*
+        * Do not change removed SPTEs. Only the thread that froze the SPTE
+        * may modify it.
+        */
+       if (iter->old_spte == REMOVED_SPTE)
+               return false;
 
-               kvm_flush_remote_tlbs_with_address(kvm, gfn,
-                                                  KVM_PAGES_PER_HPAGE(level));
+       if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
+                     new_spte) != iter->old_spte)
+               return false;
 
-               free_page((unsigned long)pt);
-               kmem_cache_free(mmu_page_header_cache, sp);
-       }
+       handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
+                           iter->level, true);
+
+       return true;
 }
 
-static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
-                               u64 old_spte, u64 new_spte, int level)
+static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
+                                          struct tdp_iter *iter)
 {
-       __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
-       handle_changed_spte_acc_track(old_spte, new_spte, level);
-       handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
-                                     new_spte, level);
+       /*
+        * Freeze the SPTE by setting it to a special,
+        * non-present value. This will stop other threads from
+        * immediately installing a present entry in its place
+        * before the TLBs are flushed.
+        */
+       if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
+               return false;
+
+       kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
+                                          KVM_PAGES_PER_HPAGE(iter->level));
+
+       /*
+        * No other thread can overwrite the removed SPTE as they
+        * must either wait on the MMU lock or use
+        * tdp_mmu_set_spte_atomic which will not overwrite the
+        * special removed SPTE value. No bookkeeping is needed
+        * here since the SPTE is going from non-present
+        * to non-present.
+        */
+       WRITE_ONCE(*iter->sptep, 0);
+
+       return true;
 }
 
+
+/*
+ * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ * @record_acc_track: Notify the MM subsystem of changes to the accessed state
+ *                   of the page. Should be set unless handling an MMU
+ *                   notifier for access tracking. Leaving record_acc_track
+ *                   unset in that case prevents page accesses from being
+ *                   double counted.
+ * @record_dirty_log: Record the page as dirty in the dirty bitmap if
+ *                   appropriate for the change being made. Should be set
+ *                   unless performing certain dirty logging operations.
+ *                   Leaving record_dirty_log unset in that case prevents page
+ *                   writes from being double counted.
+ */
 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
                                      u64 new_spte, bool record_acc_track,
                                      bool record_dirty_log)
 {
-       u64 *root_pt = tdp_iter_root_pt(iter);
+       tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
        struct kvm_mmu_page *root = sptep_to_sp(root_pt);
        int as_id = kvm_mmu_page_as_id(root);
 
-       WRITE_ONCE(*iter->sptep, new_spte);
+       lockdep_assert_held_write(&kvm->mmu_lock);
+
+       /*
+        * No thread should be using this function to set SPTEs to the
+        * temporary removed SPTE value.
+        * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
+        * should be used. If operating under the MMU lock in write mode, the
+        * use of the removed SPTE should not be necessary.
+        */
+       WARN_ON(iter->old_spte == REMOVED_SPTE);
+
+       WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
 
        __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
-                             iter->level);
+                             iter->level, false);
        if (record_acc_track)
                handle_changed_spte_acc_track(iter->old_spte, new_spte,
                                              iter->level);
@@ -413,27 +616,46 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
                         _mmu->shadow_root_level, _start, _end)
 
 /*
- * Flush the TLB if the process should drop kvm->mmu_lock.
- * Return whether the caller still needs to flush the tlb.
+ * Yield if the MMU lock is contended or this thread needs to return control
+ * to the scheduler.
+ *
+ * If this function should yield and flush is set, it will perform a remote
+ * TLB flush before yielding.
+ *
+ * If this function yields, it will also reset the tdp_iter's walk over the
+ * paging structure and the calling function should skip to the next
+ * iteration to allow the iterator to continue its traversal from the
+ * paging structure root.
+ *
+ * Return true if this function yielded and the iterator's traversal was reset.
+ * Return false if a yield was not needed.
  */
-static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
+static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
+                                            struct tdp_iter *iter, bool flush)
 {
-       if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
-               kvm_flush_remote_tlbs(kvm);
-               cond_resched_lock(&kvm->mmu_lock);
-               tdp_iter_refresh_walk(iter);
+       /* Ensure forward progress has been made before yielding. */
+       if (iter->next_last_level_gfn == iter->yielded_gfn)
                return false;
-       } else {
+
+       if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+               rcu_read_unlock();
+
+               if (flush)
+                       kvm_flush_remote_tlbs(kvm);
+
+               cond_resched_rwlock_write(&kvm->mmu_lock);
+               rcu_read_lock();
+
+               WARN_ON(iter->gfn > iter->next_last_level_gfn);
+
+               tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
+                              iter->root_level, iter->min_level,
+                              iter->next_last_level_gfn);
+
                return true;
        }
-}
 
-static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
-{
-       if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
-               cond_resched_lock(&kvm->mmu_lock);
-               tdp_iter_refresh_walk(iter);
-       }
+       return false;
 }
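
The check at the top -- only yield once next_last_level_gfn has moved past yielded_gfn -- is what guarantees the walk makes forward progress even if the MMU lock stays contended. A small user-space sketch of the same shape, where struct walk and maybe_yield() are invented names and sched_yield() stands in for cond_resched_rwlock_write():

#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative walk state: current position and where the last yield happened. */
struct walk {
	uint64_t cursor;
	uint64_t last_yield_pos;
};

/*
 * Yield only if the walk has advanced since the previous yield, mirroring the
 * iter->next_last_level_gfn == iter->yielded_gfn check.  Returns true if it
 * yielded, in which case the caller restarts the iteration from the cursor.
 */
static bool maybe_yield(struct walk *w, bool contended)
{
	if (w->cursor == w->last_yield_pos)
		return false;			/* no forward progress yet */

	if (!contended)
		return false;

	sched_yield();				/* cond_resched_rwlock_write() stand-in */
	w->last_yield_pos = w->cursor;
	return true;
}

int main(void)
{
	struct walk w = { .cursor = 0, .last_yield_pos = 0 };

	for (int i = 0; i < 4; i++) {
		if (maybe_yield(&w, true)) {
			printf("yielded at %llu, restarting\n",
			       (unsigned long long)w.cursor);
			continue;
		}
		w.cursor++;			/* process one entry */
	}
	return 0;
}
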
 
 /*
@@ -453,7 +675,15 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
        struct tdp_iter iter;
        bool flush_needed = false;
 
+       rcu_read_lock();
+
        tdp_root_for_each_pte(iter, root, start, end) {
+               if (can_yield &&
+                   tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
+                       flush_needed = false;
+                       continue;
+               }
+
                if (!is_shadow_present_pte(iter.old_spte))
                        continue;
 
@@ -468,12 +698,10 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                        continue;
 
                tdp_mmu_set_spte(kvm, &iter, 0);
-
-               if (can_yield)
-                       flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
-               else
-                       flush_needed = true;
+               flush_needed = true;
        }
+
+       rcu_read_unlock();
        return flush_needed;
 }
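
zap_gfn_range() now coalesces TLB flushes: each zap only records that a flush is pending, the yield path performs it before dropping the lock, and the final flush is left to the caller via the return value. A compact sketch of that deferred-flush bookkeeping, with invented names (zap_range, do_flush) and an arbitrary yield interval:

#include <stdbool.h>
#include <stdio.h>

static void do_flush(void)
{
	puts("remote TLB flush");		/* kvm_flush_remote_tlbs() stand-in */
}

/* Returns true if the caller still owes a flush for the entries it zapped. */
static bool zap_range(int nr_entries, bool can_yield)
{
	bool flush_needed = false;

	for (int i = 0; i < nr_entries; i++) {
		/* Yield point: flush before dropping the lock, then start clean. */
		if (can_yield && (i % 8 == 0) && flush_needed) {
			do_flush();
			flush_needed = false;
		}

		/* ... zap entry i ... */
		flush_needed = true;
	}

	return flush_needed;
}

int main(void)
{
	if (zap_range(20, true))
		do_flush();			/* the caller performs the final flush */
	return 0;
}
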
 
@@ -517,21 +745,18 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
        int ret = 0;
        int make_spte_ret = 0;
 
-       if (unlikely(is_noslot_pfn(pfn))) {
+       if (unlikely(is_noslot_pfn(pfn)))
                new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
-               trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
-       } else {
+       else
                make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
                                         pfn, iter->old_spte, prefault, true,
                                         map_writable, !shadow_accessed_mask,
                                         &new_spte);
-               trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
-       }
 
        if (new_spte == iter->old_spte)
                ret = RET_PF_SPURIOUS;
-       else
-               tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);
+       else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+               return RET_PF_RETRY;
 
        /*
         * If the page fault was caused by a write but the page is write
@@ -545,10 +770,16 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
        }
 
        /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
-       if (unlikely(is_mmio_spte(new_spte)))
+       if (unlikely(is_mmio_spte(new_spte))) {
+               trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
+                                    new_spte);
                ret = RET_PF_EMULATE;
+       } else
+               trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+                                      rcu_dereference(iter->sptep));
 
-       trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
        if (!prefault)
                vcpu->stat.pf_fixed++;
 
@@ -586,6 +817,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                                        huge_page_disallowed, &req_level);
 
        trace_kvm_mmu_spte_requested(gpa, level, pfn);
+
+       rcu_read_lock();
+
        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                if (nx_huge_page_workaround_enabled)
                        disallowed_hugepage_adjust(iter.old_spte, gfn,
@@ -601,49 +835,61 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                 */
                if (is_shadow_present_pte(iter.old_spte) &&
                    is_large_pte(iter.old_spte)) {
-                       tdp_mmu_set_spte(vcpu->kvm, &iter, 0);
-
-                       kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
-                                       KVM_PAGES_PER_HPAGE(iter.level));
+                       if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
+                               break;
 
                        /*
                         * The iter must explicitly re-read the spte here
                         * because the new value informs the !present
                         * path below.
                         */
-                       iter.old_spte = READ_ONCE(*iter.sptep);
+                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
                }
 
                if (!is_shadow_present_pte(iter.old_spte)) {
                        sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
-                       list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
                        child_pt = sp->spt;
-                       clear_page(child_pt);
+
                        new_spte = make_nonleaf_spte(child_pt,
                                                     !shadow_accessed_mask);
 
-                       trace_kvm_mmu_get_page(sp, true);
-                       if (huge_page_disallowed && req_level >= iter.level)
-                               account_huge_nx_page(vcpu->kvm, sp);
-
-                       tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
+                       if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
+                                                   new_spte)) {
+                               tdp_mmu_link_page(vcpu->kvm, sp, true,
+                                                 huge_page_disallowed &&
+                                                 req_level >= iter.level);
+
+                               trace_kvm_mmu_get_page(sp, true);
+                       } else {
+                               tdp_mmu_free_sp(sp);
+                               break;
+                       }
                }
        }
 
-       if (WARN_ON(iter.level != level))
+       if (iter.level != level) {
+               rcu_read_unlock();
                return RET_PF_RETRY;
+       }
 
        ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
                                              pfn, prefault);
+       rcu_read_unlock();
 
        return ret;
 }
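
With the fault path running under the MMU lock in read mode, tdp_mmu_set_spte_atomic() and tdp_mmu_zap_spte_atomic() (presumably cmpxchg-based) can lose a race, in which case the fault is simply retried (RET_PF_RETRY) or the walk abandoned. The sketch below shows the underlying compare-and-exchange retry shape in plain C11; try_install(), OK and RETRY are illustrative names, not KVM symbols:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

enum { OK = 0, RETRY = 1 };	/* illustrative stand-ins for RET_PF_* */

/*
 * Install @new_val only if the slot still holds @old; on failure the caller
 * retries the whole fault, much like kvm_tdp_mmu_map() returning RET_PF_RETRY
 * when the atomic SPTE update loses the race.
 */
static int try_install(_Atomic uint64_t *slot, uint64_t old, uint64_t new_val)
{
	return atomic_compare_exchange_strong(slot, &old, new_val) ? OK : RETRY;
}

int main(void)
{
	_Atomic uint64_t slot = 0;

	if (try_install(&slot, 0, 0x1234) == OK)
		printf("installed %#llx\n", (unsigned long long)slot);
	if (try_install(&slot, 0, 0x5678) == RETRY)
		printf("lost the race, retry the fault\n");
	return 0;
}
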
 
-static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
-               unsigned long end, unsigned long data,
-               int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
-                              struct kvm_mmu_page *root, gfn_t start,
-                              gfn_t end, unsigned long data))
+static __always_inline int
+kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
+                            unsigned long start,
+                            unsigned long end,
+                            unsigned long data,
+                            int (*handler)(struct kvm *kvm,
+                                           struct kvm_memory_slot *slot,
+                                           struct kvm_mmu_page *root,
+                                           gfn_t start,
+                                           gfn_t end,
+                                           unsigned long data))
 {
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
@@ -705,6 +951,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
        int young = 0;
        u64 new_spte = 0;
 
+       rcu_read_lock();
+
        tdp_root_for_each_leaf_pte(iter, root, start, end) {
                /*
                 * If we have a non-accessed entry we don't need to change the
@@ -736,6 +984,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
                trace_kvm_age_page(iter.gfn, iter.level, slot, young);
        }
 
+       rcu_read_unlock();
+
        return young;
 }
 
@@ -781,6 +1031,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
        u64 new_spte;
        int need_flush = 0;
 
+       rcu_read_lock();
+
        WARN_ON(pte_huge(*ptep));
 
        new_pfn = pte_pfn(*ptep);
@@ -809,6 +1061,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
        if (need_flush)
                kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
 
+       rcu_read_unlock();
+
        return 0;
 }
 
@@ -832,21 +1086,27 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
        u64 new_spte;
        bool spte_set = false;
 
+       rcu_read_lock();
+
        BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
 
        for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
                                   min_level, start, end) {
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+                       continue;
+
                if (!is_shadow_present_pte(iter.old_spte) ||
-                   !is_last_spte(iter.old_spte, iter.level))
+                   !is_last_spte(iter.old_spte, iter.level) ||
+                   !(iter.old_spte & PT_WRITABLE_MASK))
                        continue;
 
                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
 
                tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
                spte_set = true;
-
-               tdp_mmu_iter_cond_resched(kvm, &iter);
        }
+
+       rcu_read_unlock();
        return spte_set;
 }
 
@@ -888,7 +1148,12 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
        u64 new_spte;
        bool spte_set = false;
 
+       rcu_read_lock();
+
        tdp_root_for_each_leaf_pte(iter, root, start, end) {
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+                       continue;
+
                if (spte_ad_need_write_protect(iter.old_spte)) {
                        if (is_writable_pte(iter.old_spte))
                                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
@@ -903,9 +1168,9 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 
                tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
                spte_set = true;
-
-               tdp_mmu_iter_cond_resched(kvm, &iter);
        }
+
+       rcu_read_unlock();
        return spte_set;
 }
 
@@ -947,6 +1212,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
        struct tdp_iter iter;
        u64 new_spte;
 
+       rcu_read_lock();
+
        tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
                                    gfn + BITS_PER_LONG) {
                if (!mask)
@@ -956,6 +1223,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
                    !(mask & (1UL << (iter.gfn - gfn))))
                        continue;
 
+               mask &= ~(1UL << (iter.gfn - gfn));
+
                if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
                        if (is_writable_pte(iter.old_spte))
                                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
@@ -969,9 +1238,9 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
                }
 
                tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
-
-               mask &= ~(1UL << (iter.gfn - gfn));
        }
+
+       rcu_read_unlock();
 }
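
Clearing the mask bit before the write-protect/dirty checks guarantees that every bit handed in by the caller is consumed exactly once, even for SPTEs that turn out to need no change, so the !mask early exit still terminates the walk correctly. A tiny user-space example of the same consume-one-bit-per-iteration pattern (base_gfn and handle_gfn are invented names):

#include <stdint.h>
#include <stdio.h>

/* Illustrative: process each set bit of a 64-bit dirty mask exactly once. */
static void handle_gfn(uint64_t gfn)
{
	printf("clear dirty state for gfn %#llx\n", (unsigned long long)gfn);
}

int main(void)
{
	uint64_t base_gfn = 0x1000;
	uint64_t mask = 0x15;	/* bits 0, 2 and 4 are dirty */

	for (uint64_t off = 0; mask && off < 64; off++) {
		if (!(mask & (1ULL << off)))
			continue;

		/* Clear the bit up front, mirroring the reordered mask update. */
		mask &= ~(1ULL << off);
		handle_gfn(base_gfn + off);
	}
	return 0;
}
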
 
 /*
@@ -989,7 +1258,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
        struct kvm_mmu_page *root;
        int root_as_id;
 
-       lockdep_assert_held(&kvm->mmu_lock);
+       lockdep_assert_held_write(&kvm->mmu_lock);
        for_each_tdp_mmu_root(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
@@ -1011,18 +1280,23 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
        u64 new_spte;
        bool spte_set = false;
 
+       rcu_read_lock();
+
        tdp_root_for_each_pte(iter, root, start, end) {
-               if (!is_shadow_present_pte(iter.old_spte))
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+                       continue;
+
+               if (!is_shadow_present_pte(iter.old_spte) ||
+                   iter.old_spte & shadow_dirty_mask)
                        continue;
 
                new_spte = iter.old_spte | shadow_dirty_mask;
 
                tdp_mmu_set_spte(kvm, &iter, new_spte);
                spte_set = true;
-
-               tdp_mmu_iter_cond_resched(kvm, &iter);
        }
 
+       rcu_read_unlock();
        return spte_set;
 }
 
@@ -1049,8 +1323,8 @@ bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
 }
 
 /*
- * Clear non-leaf entries (and free associated page tables) which could
- * be replaced by large mappings, for GFNs within the slot.
+ * Clear leaf entries which could be replaced by large mappings, for
+ * GFNs within the slot.
  */
 static void zap_collapsible_spte_range(struct kvm *kvm,
                                       struct kvm_mmu_page *root,
@@ -1060,9 +1334,16 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
        kvm_pfn_t pfn;
        bool spte_set = false;
 
+       rcu_read_lock();
+
        tdp_root_for_each_pte(iter, root, start, end) {
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
+                       spte_set = false;
+                       continue;
+               }
+
                if (!is_shadow_present_pte(iter.old_spte) ||
-                   is_last_spte(iter.old_spte, iter.level))
+                   !is_last_spte(iter.old_spte, iter.level))
                        continue;
 
                pfn = spte_to_pfn(iter.old_spte);
@@ -1072,9 +1353,10 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
 
                tdp_mmu_set_spte(kvm, &iter, 0);
 
-               spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
+               spte_set = true;
        }
 
+       rcu_read_unlock();
        if (spte_set)
                kvm_flush_remote_tlbs(kvm);
 }
@@ -1111,6 +1393,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
        u64 new_spte;
        bool spte_set = false;
 
+       rcu_read_lock();
+
        tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
                if (!is_writable_pte(iter.old_spte))
                        break;
@@ -1122,6 +1406,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
                spte_set = true;
        }
 
+       rcu_read_unlock();
+
        return spte_set;
 }
 
@@ -1137,7 +1423,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
        int root_as_id;
        bool spte_set = false;
 
-       lockdep_assert_held(&kvm->mmu_lock);
+       lockdep_assert_held_write(&kvm->mmu_lock);
        for_each_tdp_mmu_root(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
@@ -1162,10 +1448,14 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
 
        *root_level = vcpu->arch.mmu->shadow_root_level;
 
+       rcu_read_lock();
+
        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                leaf = iter.level;
                sptes[leaf] = iter.old_spte;
        }
 
+       rcu_read_unlock();
+
        return leaf;
 }
index cbbdbad..b4b65e3 100644 (file)
@@ -5,10 +5,6 @@
 
 #include <linux/kvm_host.h>
 
-void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
-void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
-
-bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root);
 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
 
@@ -47,4 +43,32 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
                         int *root_level);
 
+#ifdef CONFIG_X86_64
+void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
+static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
+#else
+static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {}
+static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
+static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
+#endif
+
+static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
+{
+       struct kvm_mmu_page *sp;
+
+       if (!is_tdp_mmu_enabled(kvm))
+               return false;
+       if (WARN_ON(!VALID_PAGE(hpa)))
+               return false;
+
+       sp = to_shadow_page(hpa);
+       if (WARN_ON(!sp))
+               return false;
+
+       return is_tdp_mmu_page(sp) && sp->root_count;
+}
+
 #endif /* __KVM_X86_MMU_TDP_MMU_H */
index f472fdb..a8502e0 100644 (file)
@@ -75,7 +75,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        /* variable MTRRs */
        WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
 
-       mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
+       mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
        if ((msr & 1) == 0) {
                /* MTRR base */
                if (!valid_mtrr_type(data & 0xff))
@@ -351,14 +351,14 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        if (var_mtrr_range_is_valid(cur))
                list_del(&mtrr_state->var_ranges[index].node);
 
-       /* Extend the mask with all 1 bits to the left, since those
-        * bits must implicitly be 0.  The bits are then cleared
-        * when reading them.
+       /*
+        * Set all illegal GPA bits in the mask, since those bits must
+        * implicitly be 0.  The bits are then cleared when reading them.
         */
        if (!is_mtrr_mask)
                cur->base = data;
        else
-               cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu));
+               cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu);
 
        /* add it to the list if it's enabled. */
        if (var_mtrr_range_is_valid(cur)) {
@@ -426,7 +426,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
                else
                        *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask;
 
-               *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1;
+               *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu);
        }
 
        return 0;
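
kvm_vcpu_reserved_gpa_bits_raw() is expected to return a mask with every bit at or above the guest's MAXPHYADDR set, so OR-ing it in on write forces the illegal bits to 1 and masking it off on read strips them again. A quick worked example, assuming MAXPHYADDR = 48 (reserved_gpa_bits below is an illustrative helper, not the KVM one):

#include <stdint.h>
#include <stdio.h>

/* Illustrative: bits [63:maxphyaddr] are reserved in a guest physical address. */
static uint64_t reserved_gpa_bits(unsigned int maxphyaddr)
{
	return ~0ULL << maxphyaddr;
}

int main(void)
{
	uint64_t rsvd = reserved_gpa_bits(48);		/* 0xffff000000000000 */
	uint64_t mtrr_mask = 0x0000000ff0000000ULL;

	printf("reserved bits : %#018llx\n", (unsigned long long)rsvd);
	printf("stored mask   : %#018llx\n", (unsigned long long)(mtrr_mask | rsvd));
	printf("read-back mask: %#018llx\n",
	       (unsigned long long)((mtrr_mask | rsvd) & ~rsvd));
	return 0;
}
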
index 67741d2..827886c 100644 (file)
@@ -373,7 +373,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
                return 1;
 
        if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) &&
-           (kvm_x86_ops.get_cpl(vcpu) != 0) &&
+           (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
            (kvm_read_cr0(vcpu) & X86_CR0_PE))
                return 1;
 
@@ -383,8 +383,11 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
 
 void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
 {
-       if (lapic_in_kernel(vcpu))
+       if (lapic_in_kernel(vcpu)) {
+               if (kvm_x86_ops.pmu_ops->deliver_pmi)
+                       kvm_x86_ops.pmu_ops->deliver_pmi(vcpu);
                kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+       }
 }
 
 bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
@@ -473,6 +476,9 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
                        pmc_stop_counter(pmc);
        }
 
+       if (kvm_x86_ops.pmu_ops->cleanup)
+               kvm_x86_ops.pmu_ops->cleanup(vcpu);
+
        bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
 }
 
index 067fef5..7b30bc9 100644 (file)
@@ -39,6 +39,8 @@ struct kvm_pmu_ops {
        void (*refresh)(struct kvm_vcpu *vcpu);
        void (*init)(struct kvm_vcpu *vcpu);
        void (*reset)(struct kvm_vcpu *vcpu);
+       void (*deliver_pmi)(struct kvm_vcpu *vcpu);
+       void (*cleanup)(struct kvm_vcpu *vcpu);
 };
 
 static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
index 0ef84d5..78bdcfa 100644 (file)
@@ -298,6 +298,23 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
+                                  u32 icrl, u32 icrh)
+{
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               bool m = kvm_apic_match_dest(vcpu, source,
+                                            icrl & APIC_SHORT_MASK,
+                                            GET_APIC_DEST_FIELD(icrh),
+                                            icrl & APIC_DEST_MASK);
+
+               if (m && !avic_vcpu_is_running(vcpu))
+                       kvm_vcpu_wake_up(vcpu);
+       }
+}
+
 int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
 {
        u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
@@ -324,28 +341,14 @@ int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
                kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
                kvm_lapic_reg_write(apic, APIC_ICR, icrl);
                break;
-       case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
-               int i;
-               struct kvm_vcpu *vcpu;
-               struct kvm *kvm = svm->vcpu.kvm;
-               struct kvm_lapic *apic = svm->vcpu.arch.apic;
-
+       case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
                /*
                 * At this point, we expect that the AVIC HW has already
                 * set the appropriate IRR bits on the valid target
                 * vcpus. So, we just need to kick the appropriate vcpu.
                 */
-               kvm_for_each_vcpu(i, vcpu, kvm) {
-                       bool m = kvm_apic_match_dest(vcpu, apic,
-                                                    icrl & APIC_SHORT_MASK,
-                                                    GET_APIC_DEST_FIELD(icrh),
-                                                    icrl & APIC_DEST_MASK);
-
-                       if (m && !avic_vcpu_is_running(vcpu))
-                               kvm_vcpu_wake_up(vcpu);
-               }
+               avic_kick_target_vcpus(svm->vcpu.kvm, apic, icrl, icrh);
                break;
-       }
        case AVIC_IPI_FAILURE_INVALID_TARGET:
                WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
                          index, svm->vcpu.vcpu_id, icrh, icrl);
index cb4c6ee..cc91738 100644 (file)
@@ -58,7 +58,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
        u64 pdpte;
        int ret;
 
-       ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
+       ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
                                       offset_in_page(cr3) + index * 8, 8);
        if (ret)
                return 0;
@@ -200,6 +200,9 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
+       if (WARN_ON(!is_guest_mode(vcpu)))
+               return true;
+
        if (!nested_svm_vmrun_msrpm(svm)) {
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror =
@@ -228,6 +231,7 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
 
 static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
 {
+       struct kvm_vcpu *vcpu = &svm->vcpu;
        bool vmcb12_lma;
 
        if ((vmcb12->save.efer & EFER_SVME) == 0)
@@ -241,18 +245,10 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
 
        vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
 
-       if (!vmcb12_lma) {
-               if (vmcb12->save.cr4 & X86_CR4_PAE) {
-                       if (vmcb12->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK)
-                               return false;
-               } else {
-                       if (vmcb12->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK)
-                               return false;
-               }
-       } else {
+       if (vmcb12_lma) {
                if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
                    !(vmcb12->save.cr0 & X86_CR0_PE) ||
-                   (vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK))
+                   kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3))
                        return false;
        }
        if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
@@ -349,7 +345,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
 static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
                               bool nested_npt)
 {
-       if (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63))
+       if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
                return -EINVAL;
 
        if (!nested_npt && is_pae_paging(vcpu) &&
@@ -396,7 +392,7 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
        svm->vmcb->save.rsp = vmcb12->save.rsp;
        svm->vmcb->save.rip = vmcb12->save.rip;
        svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
-       svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_FIXED_1 | DR6_RTM;
+       svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
        svm->vmcb->save.cpl = vmcb12->save.cpl;
 }
 
index c8ffdbc..874ea30 100644 (file)
@@ -22,6 +22,7 @@
 
 #include "x86.h"
 #include "svm.h"
+#include "svm_ops.h"
 #include "cpuid.h"
 #include "trace.h"
 
@@ -342,6 +343,8 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
        unsigned long first, last;
        int ret;
 
+       lockdep_assert_held(&kvm->lock);
+
        if (ulen == 0 || uaddr + ulen < uaddr)
                return ERR_PTR(-EINVAL);
 
@@ -1039,6 +1042,74 @@ e_unpin_memory:
        return ret;
 }
 
+static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       void __user *report = (void __user *)(uintptr_t)argp->data;
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_attestation_report *data;
+       struct kvm_sev_attestation_report params;
+       void __user *p;
+       void *blob = NULL;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+               return -EFAULT;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+       if (!data)
+               return -ENOMEM;
+
+       /* User wants to query the blob length */
+       if (!params.len)
+               goto cmd;
+
+       p = (void __user *)(uintptr_t)params.uaddr;
+       if (p) {
+               if (params.len > SEV_FW_BLOB_MAX_SIZE) {
+                       ret = -EINVAL;
+                       goto e_free;
+               }
+
+               ret = -ENOMEM;
+               blob = kmalloc(params.len, GFP_KERNEL);
+               if (!blob)
+                       goto e_free;
+
+               data->address = __psp_pa(blob);
+               data->len = params.len;
+               memcpy(data->mnonce, params.mnonce, sizeof(params.mnonce));
+       }
+cmd:
+       data->handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, data, &argp->error);
+       /*
+        * If we only queried the report length, the firmware has responded
+        * with the expected length, so just return it to userspace.
+        */
+       if (!params.len)
+               goto done;
+
+       if (ret)
+               goto e_free_blob;
+
+       if (blob) {
+               if (copy_to_user(p, blob, params.len))
+                       ret = -EFAULT;
+       }
+
+done:
+       params.len = data->len;
+       if (copy_to_user(report, &params, sizeof(params)))
+               ret = -EFAULT;
+e_free_blob:
+       kfree(blob);
+e_free:
+       kfree(data);
+       return ret;
+}
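
The handler supports the usual SEV two-step flow: call once with len == 0 to learn the required blob size, then call again with a buffer of that size. A hedged sketch of how a VMM might drive this via KVM_MEMORY_ENCRYPT_OP follows; it assumes <linux/kvm.h> already carries struct kvm_sev_attestation_report and KVM_SEV_GET_ATTESTATION_REPORT from this series, and the error handling is simplified, so treat it as an illustration rather than a reference client:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch of a userspace helper; not a complete program. */
static int get_attestation_report(int vm_fd, int sev_fd,
				  const uint8_t mnonce[16],
				  void **blob, uint32_t *blob_len)
{
	struct kvm_sev_attestation_report report = {};
	struct kvm_sev_cmd cmd = {
		.id = KVM_SEV_GET_ATTESTATION_REPORT,
		.data = (uintptr_t)&report,
		.sev_fd = sev_fd,
	};

	memcpy(report.mnonce, mnonce, sizeof(report.mnonce));

	/* First pass: len == 0 asks the firmware for the required size. */
	if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) && !report.len)
		return -1;

	*blob_len = report.len;
	*blob = malloc(report.len);
	if (!*blob)
		return -1;

	/* Second pass: provide the buffer and fetch the report itself. */
	report.uaddr = (uintptr_t)*blob;
	if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd))
		return -1;

	return 0;
}
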
+
 int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
        struct kvm_sev_cmd sev_cmd;
@@ -1089,6 +1160,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
        case KVM_SEV_LAUNCH_SECRET:
                r = sev_launch_secret(kvm, &sev_cmd);
                break;
+       case KVM_SEV_GET_ATTESTATION_REPORT:
+               r = sev_get_attestation_report(kvm, &sev_cmd);
+               break;
        default:
                r = -EINVAL;
                goto out;
@@ -1119,12 +1193,20 @@ int svm_register_enc_region(struct kvm *kvm,
        if (!region)
                return -ENOMEM;
 
+       mutex_lock(&kvm->lock);
        region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
        if (IS_ERR(region->pages)) {
                ret = PTR_ERR(region->pages);
+               mutex_unlock(&kvm->lock);
                goto e_free;
        }
 
+       region->uaddr = range->addr;
+       region->size = range->size;
+
+       list_add_tail(&region->list, &sev->regions_list);
+       mutex_unlock(&kvm->lock);
+
        /*
         * The guest may change the memory encryption attribute from C=0 -> C=1
         * or vice versa for this memory range. Let's make sure caches are
@@ -1133,13 +1215,6 @@ int svm_register_enc_region(struct kvm *kvm,
         */
        sev_clflush_pages(region->pages, region->npages);
 
-       region->uaddr = range->addr;
-       region->size = range->size;
-
-       mutex_lock(&kvm->lock);
-       list_add_tail(&region->list, &sev->regions_list);
-       mutex_unlock(&kvm->lock);
-
        return ret;
 
 e_free:
@@ -1415,16 +1490,13 @@ static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
         * to be returned:
         *   GPRs RAX, RBX, RCX, RDX
         *
-        * Copy their values to the GHCB if they are dirty.
+        * Copy their values, even if they may not have been written during the
+        * VM-Exit.  It's the guest's responsibility to not consume random data.
         */
-       if (kvm_register_is_dirty(vcpu, VCPU_REGS_RAX))
-               ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]);
-       if (kvm_register_is_dirty(vcpu, VCPU_REGS_RBX))
-               ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]);
-       if (kvm_register_is_dirty(vcpu, VCPU_REGS_RCX))
-               ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]);
-       if (kvm_register_is_dirty(vcpu, VCPU_REGS_RDX))
-               ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]);
+       ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]);
+       ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]);
+       ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]);
+       ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]);
 }
 
 static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
@@ -1994,29 +2066,17 @@ void sev_es_create_vcpu(struct vcpu_svm *svm)
                                            sev_enc_bit));
 }
 
-void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu)
+void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu)
 {
        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
        struct vmcb_save_area *hostsa;
-       unsigned int i;
 
        /*
         * For an SEV-ES guest, hardware restores the host state on VMEXIT,
         * one step of which is a VMLOAD. Since hardware does not perform a
         * VMSAVE on VMRUN, the host save area must be updated here.
         */
-       asm volatile(__ex("vmsave %0") : : "a" (__sme_page_pa(sd->save_area)) : "memory");
-
-       /*
-        * Certain MSRs are restored on VMEXIT, only save ones that aren't
-        * restored.
-        */
-       for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
-               if (host_save_user_msrs[i].sev_es_restored)
-                       continue;
-
-               rdmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]);
-       }
+       vmsave(__sme_page_pa(sd->save_area));
 
        /* XCR0 is restored on VMEXIT, save the current host value */
        hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
@@ -2029,22 +2089,6 @@ void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu)
        hostsa->xss = host_xss;
 }
 
-void sev_es_vcpu_put(struct vcpu_svm *svm)
-{
-       unsigned int i;
-
-       /*
-        * Certain MSRs are restored on VMEXIT and were saved with vmsave in
-        * sev_es_vcpu_load() above. Only restore ones that weren't.
-        */
-       for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
-               if (host_save_user_msrs[i].sev_es_restored)
-                       continue;
-
-               wrmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]);
-       }
-}
-
 void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
index 7ef1717..adb3619 100644 (file)
@@ -41,6 +41,7 @@
 #include "trace.h"
 
 #include "svm.h"
+#include "svm_ops.h"
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
@@ -200,9 +201,9 @@ module_param(sev_es, int, 0444);
 bool __read_mostly dump_invalid_vmcb;
 module_param(dump_invalid_vmcb, bool, 0644);
 
-static u8 rsm_ins_bytes[] = "\x0f\xaa";
+static bool svm_gp_erratum_intercept = true;
 
-static void svm_complete_interrupts(struct vcpu_svm *svm);
+static u8 rsm_ins_bytes[] = "\x0f\xaa";
 
 static unsigned long iopm_base;
 
@@ -246,21 +247,6 @@ u32 svm_msrpm_offset(u32 msr)
 
 #define MAX_INST_SIZE 15
 
-static inline void clgi(void)
-{
-       asm volatile (__ex("clgi"));
-}
-
-static inline void stgi(void)
-{
-       asm volatile (__ex("stgi"));
-}
-
-static inline void invlpga(unsigned long addr, u32 asid)
-{
-       asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr));
-}
-
 static int get_max_npt_level(void)
 {
 #ifdef CONFIG_X86_64
@@ -288,6 +274,9 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
                if (!(efer & EFER_SVME)) {
                        svm_leave_nested(svm);
                        svm_set_gif(svm, true);
+                       /* #GP intercept is still needed for vmware backdoor */
+                       if (!enable_vmware_backdoor)
+                               clr_exception_intercept(svm, GP_VECTOR);
 
                        /*
                         * Free the nested guest state, unless we are in SMM.
@@ -304,6 +293,9 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
                                vcpu->arch.efer = old_efer;
                                return ret;
                        }
+
+                       if (svm_gp_erratum_intercept)
+                               set_exception_intercept(svm, GP_VECTOR);
                }
        }
 
@@ -454,6 +446,11 @@ static int has_svm(void)
                return 0;
        }
 
+       if (sev_active()) {
+               pr_info("KVM is unsupported when running as an SEV guest\n");
+               return 0;
+       }
+
        return 1;
 }
 
@@ -920,6 +917,9 @@ static __init void svm_set_cpu_caps(void)
 
                if (npt_enabled)
                        kvm_cpu_cap_set(X86_FEATURE_NPT);
+
+               /* Nested VM can receive #VMEXIT instead of triggering #GP */
+               kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
        }
 
        /* CPUID 0x80000008 */
@@ -1027,6 +1027,9 @@ static __init int svm_hardware_setup(void)
                }
        }
 
+       if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
+               svm_gp_erratum_intercept = false;
+
        if (vgif) {
                if (!boot_cpu_has(X86_FEATURE_VGIF))
                        vgif = false;
@@ -1202,7 +1205,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 
        svm_set_efer(&svm->vcpu, 0);
        save->dr6 = 0xffff0ff0;
-       kvm_set_rflags(&svm->vcpu, 2);
+       kvm_set_rflags(&svm->vcpu, X86_EFLAGS_FIXED);
        save->rip = 0x0000fff0;
        svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
 
@@ -1361,6 +1364,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
                svm->vmsa = page_address(vmsa_page);
 
        svm->asid_generation = 0;
+       svm->guest_state_loaded = false;
        init_vmcb(svm);
 
        svm_init_osvw(vcpu);
@@ -1408,30 +1412,31 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
        __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
 }
 
-static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
-       int i;
+       struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+       unsigned int i;
 
-       if (unlikely(cpu != vcpu->cpu)) {
-               svm->asid_generation = 0;
-               vmcb_mark_all_dirty(svm->vmcb);
-       }
+       if (svm->guest_state_loaded)
+               return;
 
+       /*
+        * Certain MSRs are restored on VMEXIT (SEV-ES) or by a vmload of the
+        * host save area (non-SEV-ES). Save the ones that aren't, so they can
+        * be restored individually later.
+        */
+       for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+               rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+
+       /*
+        * Save additional host state that will be restored on VMEXIT (sev-es)
+        * or subsequent vmload of host save area.
+        */
        if (sev_es_guest(svm->vcpu.kvm)) {
-               sev_es_vcpu_load(svm, cpu);
+               sev_es_prepare_guest_switch(svm, vcpu->cpu);
        } else {
-#ifdef CONFIG_X86_64
-               rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
-#endif
-               savesegment(fs, svm->host.fs);
-               savesegment(gs, svm->host.gs);
-               svm->host.ldt = kvm_read_ldt();
-
-               for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
-                       rdmsrl(host_save_user_msrs[i].index,
-                              svm->host_user_msrs[i]);
+               vmsave(__sme_page_pa(sd->save_area));
        }
 
        if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
@@ -1441,10 +1446,42 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                        wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
                }
        }
+
        /* This assumes that the kernel never uses MSR_TSC_AUX */
        if (static_cpu_has(X86_FEATURE_RDTSCP))
                wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
 
+       svm->guest_state_loaded = true;
+}
+
+static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       unsigned int i;
+
+       if (!svm->guest_state_loaded)
+               return;
+
+       /*
+        * Certain MSRs are restored on VMEXIT (SEV-ES) or by a vmload of the
+        * host save area (non-SEV-ES). Restore the ones that weren't.
+        */
+       for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+               wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+
+       svm->guest_state_loaded = false;
+}
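
The new guest_state_loaded flag turns host-state handling into a lazy save/restore pair: save once on the first switch towards the guest, restore only if a save actually happened, and re-arm afterwards. A generic sketch of that pattern with invented names (struct ctx, prepare_guest, prepare_host) and a single fake MSR:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative per-vCPU context with one piece of host state to juggle. */
struct ctx {
	uint64_t host_msr;
	uint64_t saved_host_msr;
	bool guest_state_loaded;
};

static void prepare_guest(struct ctx *c)
{
	if (c->guest_state_loaded)
		return;				/* already saved since the last restore */

	c->saved_host_msr = c->host_msr;	/* rdmsrl(...) in the real code */
	c->guest_state_loaded = true;
}

static void prepare_host(struct ctx *c)
{
	if (!c->guest_state_loaded)
		return;				/* nothing was saved, nothing to restore */

	c->host_msr = c->saved_host_msr;	/* wrmsrl(...) in the real code */
	c->guest_state_loaded = false;
}

int main(void)
{
	struct ctx c = { .host_msr = 42 };

	prepare_guest(&c);	/* saves host state once */
	prepare_guest(&c);	/* no-op: still loaded */
	prepare_host(&c);	/* restores and re-arms the save */
	printf("host_msr = %llu\n", (unsigned long long)c.host_msr);
	return 0;
}
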
+
+static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+
+       if (unlikely(cpu != vcpu->cpu)) {
+               svm->asid_generation = 0;
+               vmcb_mark_all_dirty(svm->vmcb);
+       }
+
        if (sd->current_vmcb != svm->vmcb) {
                sd->current_vmcb = svm->vmcb;
                indirect_branch_prediction_barrier();
@@ -1454,30 +1491,10 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_svm *svm = to_svm(vcpu);
-       int i;
-
        avic_vcpu_put(vcpu);
+       svm_prepare_host_switch(vcpu);
 
        ++vcpu->stat.host_state_reload;
-       if (sev_es_guest(svm->vcpu.kvm)) {
-               sev_es_vcpu_put(svm);
-       } else {
-               kvm_load_ldt(svm->host.ldt);
-#ifdef CONFIG_X86_64
-               loadsegment(fs, svm->host.fs);
-               wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
-               load_gs_index(svm->host.gs);
-#else
-#ifdef CONFIG_X86_32_LAZY_GS
-               loadsegment(gs, svm->host.gs);
-#endif
-#endif
-
-               for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
-                       wrmsrl(host_save_user_msrs[i].index,
-                              svm->host_user_msrs[i]);
-       }
 }
 
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -1810,7 +1827,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
        vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
 }
 
-static void update_exception_bitmap(struct kvm_vcpu *vcpu)
+static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -1860,7 +1877,7 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
        get_debugreg(vcpu->arch.db[2], 2);
        get_debugreg(vcpu->arch.db[3], 3);
        /*
-        * We cannot reset svm->vmcb->save.dr6 to DR6_FIXED_1|DR6_RTM here,
+        * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
         * because db_interception might need it.  We can do it before vmentry.
         */
        vcpu->arch.dr6 = svm->vmcb->save.dr6;
@@ -1911,7 +1928,7 @@ static int db_interception(struct vcpu_svm *svm)
        if (!(svm->vcpu.guest_debug &
              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
                !svm->nmi_singlestep) {
-               u32 payload = (svm->vmcb->save.dr6 ^ DR6_RTM) & ~DR6_FIXED_1;
+               u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
                kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);
                return 1;
        }
@@ -1957,24 +1974,6 @@ static int ac_interception(struct vcpu_svm *svm)
        return 1;
 }
 
-static int gp_interception(struct vcpu_svm *svm)
-{
-       struct kvm_vcpu *vcpu = &svm->vcpu;
-       u32 error_code = svm->vmcb->control.exit_info_1;
-
-       WARN_ON_ONCE(!enable_vmware_backdoor);
-
-       /*
-        * VMware backdoor emulation on #GP interception only handles IN{S},
-        * OUT{S}, and RDPMC, none of which generate a non-zero error code.
-        */
-       if (error_code) {
-               kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
-               return 1;
-       }
-       return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
-}
-
 static bool is_erratum_383(void)
 {
        int err, i;
@@ -2173,6 +2172,102 @@ static int vmrun_interception(struct vcpu_svm *svm)
        return nested_svm_vmrun(svm);
 }
 
+enum {
+       NONE_SVM_INSTR,
+       SVM_INSTR_VMRUN,
+       SVM_INSTR_VMLOAD,
+       SVM_INSTR_VMSAVE,
+};
+
+/* Return NONE_SVM_INSTR if not an SVM instruction, otherwise the decoded opcode */
+static int svm_instr_opcode(struct kvm_vcpu *vcpu)
+{
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+       if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
+               return NONE_SVM_INSTR;
+
+       switch (ctxt->modrm) {
+       case 0xd8: /* VMRUN */
+               return SVM_INSTR_VMRUN;
+       case 0xda: /* VMLOAD */
+               return SVM_INSTR_VMLOAD;
+       case 0xdb: /* VMSAVE */
+               return SVM_INSTR_VMSAVE;
+       default:
+               break;
+       }
+
+       return NONE_SVM_INSTR;
+}
+
+static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
+{
+       const int guest_mode_exit_codes[] = {
+               [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
+               [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
+               [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
+       };
+       int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = {
+               [SVM_INSTR_VMRUN] = vmrun_interception,
+               [SVM_INSTR_VMLOAD] = vmload_interception,
+               [SVM_INSTR_VMSAVE] = vmsave_interception,
+       };
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (is_guest_mode(vcpu)) {
+               svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode];
+               svm->vmcb->control.exit_info_1 = 0;
+               svm->vmcb->control.exit_info_2 = 0;
+
+               return nested_svm_vmexit(svm);
+       } else
+               return svm_instr_handlers[opcode](svm);
+}
+
+/*
+ * #GP handling code. Note that #GP can be triggered in two cases:
+ *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that raise #GP on
+ *      some AMD CPUs when the address in rAX falls within a reserved memory
+ *      region (e.g. SMM memory on the host).
+ *   2) The VMware backdoor.
+ */
+static int gp_interception(struct vcpu_svm *svm)
+{
+       struct kvm_vcpu *vcpu = &svm->vcpu;
+       u32 error_code = svm->vmcb->control.exit_info_1;
+       int opcode;
+
+       /* Both #GP cases have zero error_code */
+       if (error_code)
+               goto reinject;
+
+       /* Decode the instruction for use later */
+       if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
+               goto reinject;
+
+       opcode = svm_instr_opcode(vcpu);
+
+       if (opcode == NONE_SVM_INSTR) {
+               if (!enable_vmware_backdoor)
+                       goto reinject;
+
+               /*
+                * VMware backdoor emulation on #GP interception only handles
+                * IN{S}, OUT{S}, and RDPMC.
+                */
+               if (!is_guest_mode(vcpu))
+                       return kvm_emulate_instruction(vcpu,
+                               EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
+       } else
+               return emulate_svm_instr(vcpu, opcode);
+
+reinject:
+       kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+       return 1;
+}
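
The erratum path decodes the faulting instruction once and then dispatches through two small tables keyed by the same enum: exit codes to forward to a nested hypervisor, and handlers to run for L1. A compact, self-contained sketch of that table-driven dispatch with made-up opcodes and handlers:

#include <stdio.h>

/* Illustrative opcode ids, mirroring NONE_SVM_INSTR / SVM_INSTR_*. */
enum { OP_NONE, OP_A, OP_B, OP_MAX };

static int handle_a(void) { puts("handle A"); return 1; }
static int handle_b(void) { puts("handle B"); return 1; }

static int dispatch(int opcode, int guest_mode)
{
	static const int exit_codes[OP_MAX] = { [OP_A] = 0x80, [OP_B] = 0x81 };
	static int (*const handlers[OP_MAX])(void) = {
		[OP_A] = handle_a,
		[OP_B] = handle_b,
	};

	if (opcode == OP_NONE)
		return 0;	/* reinject / fall back, as in gp_interception() */

	if (guest_mode) {
		printf("forward exit code %#x to L1\n", exit_codes[opcode]);
		return 1;
	}

	return handlers[opcode]();
}

int main(void)
{
	dispatch(OP_A, 0);
	dispatch(OP_B, 1);
	return 0;
}
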
+
 void svm_set_gif(struct vcpu_svm *svm, bool value)
 {
        if (value) {
@@ -2260,11 +2355,8 @@ static int xsetbv_interception(struct vcpu_svm *svm)
        u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
        u32 index = kvm_rcx_read(&svm->vcpu);
 
-       if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
-               return kvm_skip_emulated_instruction(&svm->vcpu);
-       }
-
-       return 1;
+       int err = kvm_set_xcr(&svm->vcpu, index, new_bv);
+       return kvm_complete_insn_gp(&svm->vcpu, err);
 }
 
 static int rdpru_interception(struct vcpu_svm *svm)
@@ -2525,6 +2617,7 @@ static int dr_interception(struct vcpu_svm *svm)
 {
        int reg, dr;
        unsigned long val;
+       int err = 0;
 
        if (svm->vcpu.guest_debug == 0) {
                /*
@@ -2542,20 +2635,16 @@ static int dr_interception(struct vcpu_svm *svm)
 
        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
        dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
-
-       if (dr >= 16) { /* mov to DRn */
-               if (!kvm_require_dr(&svm->vcpu, dr - 16))
-                       return 1;
+       if (dr >= 16) { /* mov to DRn  */
+               dr -= 16;
                val = kvm_register_read(&svm->vcpu, reg);
-               kvm_set_dr(&svm->vcpu, dr - 16, val);
+               err = kvm_set_dr(&svm->vcpu, dr, val);
        } else {
-               if (!kvm_require_dr(&svm->vcpu, dr))
-                       return 1;
                kvm_get_dr(&svm->vcpu, dr, &val);
                kvm_register_write(&svm->vcpu, reg, val);
        }
 
-       return kvm_skip_emulated_instruction(&svm->vcpu);
+       return kvm_complete_insn_gp(&svm->vcpu, err);
 }
 
 static int cr8_write_interception(struct vcpu_svm *svm)
@@ -3349,7 +3438,7 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
                SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
 }
 
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -3474,7 +3563,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
        return !svm_interrupt_blocked(vcpu);
 }
 
-static void enable_irq_window(struct kvm_vcpu *vcpu)
+static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -3498,7 +3587,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
        }
 }
 
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
+static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -3555,10 +3644,6 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
        invlpga(gva, svm->vmcb->control.asid);
 }
 
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
-{
-}
-
 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -3703,16 +3788,11 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
        if (sev_es_guest(svm->vcpu.kvm)) {
                __svm_sev_es_vcpu_run(svm->vmcb_pa);
        } else {
+               struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+
                __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
 
-#ifdef CONFIG_X86_64
-               native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
-#else
-               loadsegment(fs, svm->host.fs);
-#ifndef CONFIG_X86_32_LAZY_GS
-               loadsegment(gs, svm->host.gs);
-#endif
-#endif
+               vmload(__sme_page_pa(sd->save_area));
        }
 
        /*
@@ -3739,6 +3819,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
+       trace_kvm_entry(vcpu);
+
        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
        svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
@@ -3776,7 +3858,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
                svm_set_dr6(svm, vcpu->arch.dr6);
        else
-               svm_set_dr6(svm, DR6_FIXED_1 | DR6_RTM);
+               svm_set_dr6(svm, DR6_ACTIVE_LOW);
 
        clgi();
        kvm_load_guest_xsave_state(vcpu);
@@ -3971,7 +4053,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
        if (sev_guest(vcpu->kvm)) {
                best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0);
                if (best)
-                       vcpu->arch.cr3_lm_rsvd_bits &= ~(1UL << (best->ebx & 0x3f));
+                       vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
        }
 
        if (!kvm_vcpu_apicv_active(vcpu))
@@ -4278,7 +4360,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
        return ret;
 }
 
-static void enable_smi_window(struct kvm_vcpu *vcpu)
+static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -4432,7 +4514,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .vcpu_blocking = svm_vcpu_blocking,
        .vcpu_unblocking = svm_vcpu_unblocking,
 
-       .update_exception_bitmap = update_exception_bitmap,
+       .update_exception_bitmap = svm_update_exception_bitmap,
        .get_msr_feature = svm_get_msr_feature,
        .get_msr = svm_get_msr,
        .set_msr = svm_set_msr,
@@ -4475,9 +4557,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .nmi_allowed = svm_nmi_allowed,
        .get_nmi_mask = svm_get_nmi_mask,
        .set_nmi_mask = svm_set_nmi_mask,
-       .enable_nmi_window = enable_nmi_window,
-       .enable_irq_window = enable_irq_window,
-       .update_cr8_intercept = update_cr8_intercept,
+       .enable_nmi_window = svm_enable_nmi_window,
+       .enable_irq_window = svm_enable_irq_window,
+       .update_cr8_intercept = svm_update_cr8_intercept,
        .set_virtual_apic_mode = svm_set_virtual_apic_mode,
        .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
        .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
@@ -4520,7 +4602,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .smi_allowed = svm_smi_allowed,
        .pre_enter_smm = svm_pre_enter_smm,
        .pre_leave_smm = svm_pre_leave_smm,
-       .enable_smi_window = enable_smi_window,
+       .enable_smi_window = svm_enable_smi_window,
 
        .mem_enc_op = svm_mem_enc_op,
        .mem_enc_reg_region = svm_register_enc_region,
index 0fe874a..39e071f 100644 (file)
 
 #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
 
-static const struct svm_host_save_msrs {
-       u32 index;              /* Index of the MSR */
-       bool sev_es_restored;   /* True if MSR is restored on SEV-ES VMEXIT */
-} host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
-       { .index = MSR_STAR,                    .sev_es_restored = true },
-       { .index = MSR_LSTAR,                   .sev_es_restored = true },
-       { .index = MSR_CSTAR,                   .sev_es_restored = true },
-       { .index = MSR_SYSCALL_MASK,            .sev_es_restored = true },
-       { .index = MSR_KERNEL_GS_BASE,          .sev_es_restored = true },
-       { .index = MSR_FS_BASE,                 .sev_es_restored = true },
-#endif
-       { .index = MSR_IA32_SYSENTER_CS,        .sev_es_restored = true },
-       { .index = MSR_IA32_SYSENTER_ESP,       .sev_es_restored = true },
-       { .index = MSR_IA32_SYSENTER_EIP,       .sev_es_restored = true },
-       { .index = MSR_TSC_AUX,                 .sev_es_restored = false },
+static const u32 host_save_user_msrs[] = {
+       MSR_TSC_AUX,
 };
 #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
 
@@ -130,12 +116,6 @@ struct vcpu_svm {
        u64 next_rip;
 
        u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
-       struct {
-               u16 fs;
-               u16 gs;
-               u16 ldt;
-               u64 gs_base;
-       } host;
 
        u64 spec_ctrl;
        /*
@@ -192,6 +172,8 @@ struct vcpu_svm {
        u64 ghcb_sa_len;
        bool ghcb_sa_sync;
        bool ghcb_sa_free;
+
+       bool guest_state_loaded;
 };
 
 struct svm_cpu_data {
@@ -403,9 +385,6 @@ static inline bool gif_set(struct vcpu_svm *svm)
 }
 
 /* svm.c */
-#define MSR_CR3_LEGACY_RESERVED_MASK           0xfe7U
-#define MSR_CR3_LEGACY_PAE_RESERVED_MASK       0x7U
-#define MSR_CR3_LONG_MBZ_MASK                  0xfff0000000000000U
 #define MSR_INVALID                            0xffffffffU
 
 extern int sev;
@@ -590,9 +569,8 @@ int sev_handle_vmgexit(struct vcpu_svm *svm);
 int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
 void sev_es_init_vmcb(struct vcpu_svm *svm);
 void sev_es_create_vcpu(struct vcpu_svm *svm);
-void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu);
-void sev_es_vcpu_put(struct vcpu_svm *svm);
 void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
+void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
 
 /* vmenter.S */
 
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
new file mode 100644 (file)
index 0000000..8170f2a
--- /dev/null
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SVM_OPS_H
+#define __KVM_X86_SVM_OPS_H
+
+#include <linux/compiler_types.h>
+
+#include <asm/kvm_host.h>
+
+#define svm_asm(insn, clobber...)                              \
+do {                                                           \
+       asm_volatile_goto("1: " __stringify(insn) "\n\t"        \
+                         _ASM_EXTABLE(1b, %l[fault])           \
+                         ::: clobber : fault);                 \
+       return;                                                 \
+fault:                                                         \
+       kvm_spurious_fault();                                   \
+} while (0)
+
+#define svm_asm1(insn, op1, clobber...)                                \
+do {                                                           \
+       asm_volatile_goto("1: "  __stringify(insn) " %0\n\t"    \
+                         _ASM_EXTABLE(1b, %l[fault])           \
+                         :: op1 : clobber : fault);            \
+       return;                                                 \
+fault:                                                         \
+       kvm_spurious_fault();                                   \
+} while (0)
+
+#define svm_asm2(insn, op1, op2, clobber...)                           \
+do {                                                                   \
+       asm_volatile_goto("1: "  __stringify(insn) " %1, %0\n\t"        \
+                         _ASM_EXTABLE(1b, %l[fault])                   \
+                         :: op1, op2 : clobber : fault);               \
+       return;                                                         \
+fault:                                                                 \
+       kvm_spurious_fault();                                           \
+} while (0)
+
+static inline void clgi(void)
+{
+       svm_asm(clgi);
+}
+
+static inline void stgi(void)
+{
+       svm_asm(stgi);
+}
+
+static inline void invlpga(unsigned long addr, u32 asid)
+{
+       svm_asm2(invlpga, "c"(asid), "a"(addr));
+}
+
+/*
+ * Despite being a physical address, the portion of rAX that is consumed by
+ * VMSAVE, VMLOAD, etc... is still controlled by the effective address size,
+ * hence 'unsigned long' instead of 'hpa_t'.
+ */
+static inline void vmsave(unsigned long pa)
+{
+       svm_asm1(vmsave, "a" (pa), "memory");
+}
+
+static inline void vmload(unsigned long pa)
+{
+       svm_asm1(vmload, "a" (pa), "memory");
+}
+
+#endif /* __KVM_X86_SVM_OPS_H */
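For readers unfamiliar with the asm_volatile_goto()/_ASM_EXTABLE() pattern, the hand-expanded sketch below shows roughly what svm_asm1(vmsave, "a" (pa), "memory") boils down to; the function name is illustrative, everything else follows directly from the macro above:

static inline void vmsave_expanded(unsigned long pa)
{
        /* "1:" labels the instruction so the extable entry can point at it. */
        asm_volatile_goto("1: vmsave %0\n\t"
                          _ASM_EXTABLE(1b, %l[fault])   /* on a fault, jump to 'fault' */
                          :: "a" (pa) : "memory" : fault);
        return;
fault:
        kvm_spurious_fault();   /* WARN and bail, matching the VMX wrappers */
}

The same shape applies to clgi()/stgi() through svm_asm() and to invlpga() through svm_asm2().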
index 2de30c2..a61c015 100644 (file)
@@ -92,6 +92,42 @@ TRACE_EVENT(kvm_hv_hypercall,
                  __entry->outgpa)
 );
 
+/*
+ * Tracepoint for Xen hypercall.
+ */
+TRACE_EVENT(kvm_xen_hypercall,
+       TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
+                unsigned long a2, unsigned long a3, unsigned long a4,
+                unsigned long a5),
+       TP_ARGS(nr, a0, a1, a2, a3, a4, a5),
+
+       TP_STRUCT__entry(
+               __field(unsigned long, nr)
+               __field(unsigned long, a0)
+               __field(unsigned long, a1)
+               __field(unsigned long, a2)
+               __field(unsigned long, a3)
+               __field(unsigned long, a4)
+               __field(unsigned long, a5)
+       ),
+
+       TP_fast_assign(
+               __entry->nr = nr;
+               __entry->a0 = a0;
+               __entry->a1 = a1;
+               __entry->a2 = a2;
+               __entry->a3 = a3;
+               __entry->a4 = a4;
+               __entry->a5 = a5;
+       ),
+
+       TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 0x%lx",
+                 __entry->nr, __entry->a0, __entry->a1,  __entry->a2,
+                 __entry->a3, __entry->a4, __entry->a5)
+);
+
+
+
 /*
  * Tracepoint for PIO.
  */
@@ -256,7 +292,7 @@ TRACE_EVENT(name,                                                        \
                __entry->guest_rip      = kvm_rip_read(vcpu);                \
                __entry->isa            = isa;                               \
                __entry->vcpu_id        = vcpu->vcpu_id;                     \
-               kvm_x86_ops.get_exit_info(vcpu, &__entry->info1,             \
+               static_call(kvm_x86_get_exit_info)(vcpu, &__entry->info1,    \
                                          &__entry->info2,                   \
                                          &__entry->intr_info,               \
                                          &__entry->error_code);             \
@@ -738,7 +774,7 @@ TRACE_EVENT(kvm_emulate_insn,
                ),
 
        TP_fast_assign(
-               __entry->csbase = kvm_x86_ops.get_segment_base(vcpu, VCPU_SREG_CS);
+               __entry->csbase = static_call(kvm_x86_get_segment_base)(vcpu, VCPU_SREG_CS);
                __entry->len = vcpu->arch.emulate_ctxt->fetch.ptr
                               - vcpu->arch.emulate_ctxt->fetch.data;
                __entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len;
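As a hedged reference, the tracepoint added above is fired from the Xen hypercall handler in the new arch/x86/kvm/xen.c (not shown in this section) roughly like this, where input and params[] hold the hypercall number and arguments already read from the guest registers:

        trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
                                params[3], params[4], params[5]);

The static_call(kvm_x86_get_exit_info) and static_call(kvm_x86_get_segment_base) conversions in the same hunks rely on the per-op static calls generated from the new arch/x86/include/asm/kvm-x86-ops.h.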
index 3a18614..d1d7798 100644 (file)
@@ -19,6 +19,9 @@ extern int __read_mostly pt_mode;
 #define PT_MODE_HOST_GUEST     1
 
 #define PMU_CAP_FW_WRITES      (1ULL << 13)
+#define PMU_CAP_LBR_FMT                0x3f
+
+#define DEBUGCTLMSR_LBR_MASK           (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI)
 
 struct nested_vmx_msrs {
        /*
@@ -262,6 +265,12 @@ static inline bool cpu_has_vmx_tsc_scaling(void)
                SECONDARY_EXEC_TSC_SCALING;
 }
 
+static inline bool cpu_has_vmx_bus_lock_detection(void)
+{
+       return vmcs_config.cpu_based_2nd_exec_ctrl &
+           SECONDARY_EXEC_BUS_LOCK_DETECTION;
+}
+
 static inline bool cpu_has_vmx_apicv(void)
 {
        return cpu_has_vmx_apic_register_virt() &&
@@ -371,11 +380,28 @@ static inline bool vmx_pt_mode_is_host_guest(void)
 
 static inline u64 vmx_get_perf_capabilities(void)
 {
+       u64 perf_cap = 0;
+
+       if (boot_cpu_has(X86_FEATURE_PDCM))
+               rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
+
+       perf_cap &= PMU_CAP_LBR_FMT;
+
        /*
         * Since counters are virtualized, KVM would support full
         * width counting unconditionally, even if the host lacks it.
         */
-       return PMU_CAP_FW_WRITES;
+       return PMU_CAP_FW_WRITES | perf_cap;
+}
+
+static inline u64 vmx_supported_debugctl(void)
+{
+       u64 debugctl = 0;
+
+       if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)
+               debugctl |= DEBUGCTLMSR_LBR_MASK;
+
+       return debugctl;
 }
 
 #endif /* __KVM_X86_VMX_CAPS_H */
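A short hedged sketch of how the new capability helpers fit together; the function name is illustrative, and the bit positions are from the SDM (IA32_PERF_CAPABILITIES bits 5:0 hold the LBR format, bit 13 is full-width counter writes):

static inline bool example_can_expose_lbr(void)
{
        /* A zero LBR format field means the host has no LBRs to offer. */
        return (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) != 0;
}

When this returns true, vmx_supported_debugctl() additionally allows the guest to set DEBUGCTLMSR_LBR and DEBUGCTLMSR_FREEZE_LBRS_ON_PMI.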
index 0fbb469..b2f0b5e 100644 (file)
@@ -12,6 +12,7 @@
 #include "nested.h"
 #include "pmu.h"
 #include "trace.h"
+#include "vmx.h"
 #include "x86.h"
 
 static bool __read_mostly enable_shadow_vmcs = 1;
@@ -411,8 +412,8 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
                if (nr == DB_VECTOR) {
                        if (!has_payload) {
                                payload = vcpu->arch.dr6;
-                               payload &= ~(DR6_FIXED_1 | DR6_BT);
-                               payload ^= DR6_RTM;
+                               payload &= ~DR6_BT;
+                               payload ^= DR6_ACTIVE_LOW;
                        }
                        *exit_qual = payload;
                } else
@@ -744,8 +745,7 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
           (CC(!nested_cpu_has_vid(vmcs12)) ||
            CC(!nested_exit_intr_ack_set(vcpu)) ||
            CC((vmcs12->posted_intr_nv & 0xff00)) ||
-           CC((vmcs12->posted_intr_desc_addr & 0x3f)) ||
-           CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))))
+           CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
                return -EINVAL;
 
        /* tpr shadow is needed by all apicv features. */
@@ -758,13 +758,11 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
                                       u32 count, u64 addr)
 {
-       int maxphyaddr;
-
        if (count == 0)
                return 0;
-       maxphyaddr = cpuid_maxphyaddr(vcpu);
-       if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
-           (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
+
+       if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
+           !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
                return -EINVAL;
 
        return 0;
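The converted checks lean on GPA helpers added elsewhere in this series (arch/x86/kvm/cpuid.h, not shown here); a hedged sketch of their shape, assuming vcpu->arch.reserved_gpa_bits holds the address bits above the guest's MAXPHYADDR:

static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
        /* Any bit above the guest's MAXPHYADDR makes the GPA illegal. */
        return !(gpa & vcpu->arch.reserved_gpa_bits);
}

static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu,
                                                 gpa_t gpa, gpa_t alignment)
{
        return IS_ALIGNED(gpa, alignment) && kvm_vcpu_is_legal_gpa(vcpu, gpa);
}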
@@ -1062,14 +1060,6 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
        }
 }
 
-static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
-{
-       unsigned long invalid_mask;
-
-       invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
-       return (val & invalid_mask) == 0;
-}
-
 /*
  * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit.
  * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't
@@ -1121,7 +1111,7 @@ static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
                               enum vm_entry_failure_code *entry_failure_code)
 {
-       if (CC(!nested_cr3_valid(vcpu, cr3))) {
+       if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
                *entry_failure_code = ENTRY_FAIL_DEFAULT;
                return -EINVAL;
        }
@@ -2532,7 +2522,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
         * bitwise-or of what L1 wants to trap for L2, and what we want to
         * trap. Note that CR0.TS also needs updating - we do this later.
         */
-       update_exception_bitmap(vcpu);
+       vmx_update_exception_bitmap(vcpu);
        vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 
@@ -2635,7 +2625,6 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       int maxphyaddr = cpuid_maxphyaddr(vcpu);
 
        /* Check for memory type validity */
        switch (new_eptp & VMX_EPTP_MT_MASK) {
@@ -2666,7 +2655,7 @@ static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
        }
 
        /* Reserved bits should not be set */
-       if (CC(new_eptp >> maxphyaddr || ((new_eptp >> 7) & 0x1f)))
+       if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
                return false;
 
        /* AD, if set, should be supported */
@@ -2850,7 +2839,7 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
 
        if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
            CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
-           CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3)))
+           CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3)))
                return -EINVAL;
 
        if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
@@ -3057,35 +3046,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
                vmx->loaded_vmcs->host_state.cr4 = cr4;
        }
 
-       asm(
-               "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
-               "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
-               "je 1f \n\t"
-               __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
-               "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
-               "1: \n\t"
-               "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
-
-               /* Check if vmlaunch or vmresume is needed */
-               "cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
-
-               /*
-                * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
-                * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
-                * Valid.  vmx_vmenter() directly "returns" RFLAGS, and so the
-                * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
-                */
-               "call vmx_vmenter\n\t"
-
-               CC_SET(be)
-             : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
-             : [HOST_RSP]"r"((unsigned long)HOST_RSP),
-               [loaded_vmcs]"r"(vmx->loaded_vmcs),
-               [launched]"i"(offsetof(struct loaded_vmcs, launched)),
-               [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
-               [wordsize]"i"(sizeof(ulong))
-             : "memory"
-       );
+       vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+                                vmx->loaded_vmcs->launched);
 
        if (vmx->msr_autoload.host.nr)
                vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
@@ -3124,13 +3086,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
+static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
 {
-       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       struct kvm_host_map *map;
-       struct page *page;
-       u64 hpa;
 
        /*
         * hv_evmcs may end up being not mapped after migration (when
@@ -3153,6 +3111,17 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
                }
        }
 
+       return true;
+}
+
+static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct kvm_host_map *map;
+       struct page *page;
+       u64 hpa;
+
        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
                /*
                 * Translate L1 physical address to host physical
@@ -3221,6 +3190,18 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
                exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
        else
                exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
+
+       return true;
+}
+
+static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+       if (!nested_get_evmcs_page(vcpu))
+               return false;
+
+       if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
+               return false;
+
        return true;
 }
 
@@ -3311,7 +3292,11 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        enum vm_entry_failure_code entry_failure_code;
        bool evaluate_pending_interrupts;
-       u32 exit_reason, failed_index;
+       union vmx_exit_reason exit_reason = {
+               .basic = EXIT_REASON_INVALID_STATE,
+               .failed_vmentry = 1,
+       };
+       u32 failed_index;
 
        if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
                kvm_vcpu_flush_tlb_current(vcpu);
@@ -3363,7 +3348,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 
                if (nested_vmx_check_guest_state(vcpu, vmcs12,
                                                 &entry_failure_code)) {
-                       exit_reason = EXIT_REASON_INVALID_STATE;
+                       exit_reason.basic = EXIT_REASON_INVALID_STATE;
                        vmcs12->exit_qualification = entry_failure_code;
                        goto vmentry_fail_vmexit;
                }
@@ -3374,7 +3359,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
                vcpu->arch.tsc_offset += vmcs12->tsc_offset;
 
        if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) {
-               exit_reason = EXIT_REASON_INVALID_STATE;
+               exit_reason.basic = EXIT_REASON_INVALID_STATE;
                vmcs12->exit_qualification = entry_failure_code;
                goto vmentry_fail_vmexit_guest_mode;
        }
@@ -3384,7 +3369,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
                                                   vmcs12->vm_entry_msr_load_addr,
                                                   vmcs12->vm_entry_msr_load_count);
                if (failed_index) {
-                       exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
+                       exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
                        vmcs12->exit_qualification = failed_index;
                        goto vmentry_fail_vmexit_guest_mode;
                }
@@ -3452,7 +3437,7 @@ vmentry_fail_vmexit:
                return NVMX_VMENTRY_VMEXIT;
 
        load_vmcs12_host_state(vcpu, vmcs12);
-       vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
+       vmcs12->vm_exit_reason = exit_reason.full;
        if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
                vmx->nested.need_vmcs12_to_shadow_sync = true;
        return NVMX_VMENTRY_VMEXIT;
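The u32-to-union conversion throughout this file relies on the new union vmx_exit_reason added to vmx.h (not shown in this section); a simplified sketch of its layout, with field names approximating the in-tree definition and only the bits used here called out:

union vmx_exit_reason {
        struct {
                u32 basic               : 16;   /* EXIT_REASON_* value */
                u32 reserved16_25       : 10;
                u32 bus_lock_detected   : 1;    /* bit 26 */
                u32 enclave_mode        : 1;
                u32 smi_pending_mtf     : 1;
                u32 smi_from_vmx_root   : 1;
                u32 reserved30          : 1;
                u32 failed_vmentry      : 1;    /* bit 31 */
        };
        u32 full;                               /* raw VMCS exit reason */
};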
@@ -5540,7 +5525,12 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
        return kvm_skip_emulated_instruction(vcpu);
 
 fail:
-       nested_vmx_vmexit(vcpu, vmx->exit_reason,
+       /*
+        * This is effectively a reflected VM-Exit, as opposed to a synthesized
+        * nested VM-Exit.  Pass the original exit reason, i.e. don't hardcode
+        * EXIT_REASON_VMFUNC as the exit reason.
+        */
+       nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
                          vmx_get_intr_info(vcpu),
                          vmx_get_exit_qual(vcpu));
        return 1;
@@ -5608,7 +5598,8 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
  * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
  */
 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
-       struct vmcs12 *vmcs12, u32 exit_reason)
+                                       struct vmcs12 *vmcs12,
+                                       union vmx_exit_reason exit_reason)
 {
        u32 msr_index = kvm_rcx_read(vcpu);
        gpa_t bitmap;
@@ -5622,7 +5613,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
         * First we need to figure out which of the four to use:
         */
        bitmap = vmcs12->msr_bitmap;
-       if (exit_reason == EXIT_REASON_MSR_WRITE)
+       if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
                bitmap += 2048;
        if (msr_index >= 0xc0000000) {
                msr_index -= 0xc0000000;
@@ -5759,11 +5750,12 @@ static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
  * Return true if L0 wants to handle an exit from L2 regardless of whether or not
  * L1 wants the exit.  Only call this when in is_guest_mode (L2).
  */
-static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
+static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
+                                    union vmx_exit_reason exit_reason)
 {
        u32 intr_info;
 
-       switch ((u16)exit_reason) {
+       switch ((u16)exit_reason.basic) {
        case EXIT_REASON_EXCEPTION_NMI:
                intr_info = vmx_get_intr_info(vcpu);
                if (is_nmi(intr_info))
@@ -5819,12 +5811,13 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
  * Return 1 if L1 wants to intercept an exit from L2.  Only call this when in
  * is_guest_mode (L2).
  */
-static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
+static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
+                                    union vmx_exit_reason exit_reason)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        u32 intr_info;
 
-       switch ((u16)exit_reason) {
+       switch ((u16)exit_reason.basic) {
        case EXIT_REASON_EXCEPTION_NMI:
                intr_info = vmx_get_intr_info(vcpu);
                if (is_nmi(intr_info))
@@ -5943,7 +5936,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u32 exit_reason = vmx->exit_reason;
+       union vmx_exit_reason exit_reason = vmx->exit_reason;
        unsigned long exit_qual;
        u32 exit_intr_info;
 
@@ -5962,7 +5955,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
                goto reflect_vmexit;
        }
 
-       trace_kvm_nested_vmexit(exit_reason, vcpu, KVM_ISA_VMX);
+       trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX);
 
        /* If L0 (KVM) wants the exit, it trumps L1's desires. */
        if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
@@ -5988,7 +5981,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
        exit_qual = vmx_get_exit_qual(vcpu);
 
 reflect_vmexit:
-       nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual);
+       nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
        return true;
 }
 
@@ -6077,11 +6070,14 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
        if (is_guest_mode(vcpu)) {
                sync_vmcs02_to_vmcs12(vcpu, vmcs12);
                sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
-       } else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
-               if (vmx->nested.hv_evmcs)
-                       copy_enlightened_to_vmcs12(vmx);
-               else if (enable_shadow_vmcs)
-                       copy_shadow_to_vmcs12(vmx);
+       } else {
+               copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
+               if (!vmx->nested.need_vmcs12_to_shadow_sync) {
+                       if (vmx->nested.hv_evmcs)
+                               copy_enlightened_to_vmcs12(vmx);
+                       else if (enable_shadow_vmcs)
+                               copy_shadow_to_vmcs12(vmx);
+               }
        }
 
        BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
@@ -6602,7 +6598,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
        .hv_timer_pending = nested_vmx_preemption_timer_pending,
        .get_state = vmx_get_nested_state,
        .set_state = vmx_set_nested_state,
-       .get_nested_state_pages = nested_get_vmcs12_pages,
+       .get_nested_state_pages = vmx_get_nested_state_pages,
        .write_log_dirty = nested_vmx_write_pml_buffer,
        .enable_evmcs = nested_enable_evmcs,
        .get_evmcs_version = nested_get_evmcs_version,
index a886a47..d1df618 100644 (file)
@@ -29,7 +29,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = {
        [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
        [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
        [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
-       [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
+       [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
 };
 
 /* mapping between fixed pmc index and intel_arch_events array */
@@ -152,12 +152,17 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
        return &counters[array_index_nospec(idx, num_counters)];
 }
 
-static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
 {
        if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
-               return false;
+               return 0;
 
-       return vcpu->arch.perf_capabilities & PMU_CAP_FW_WRITES;
+       return vcpu->arch.perf_capabilities;
+}
+
+static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+{
+       return (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_FW_WRITES) != 0;
 }
 
 static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
@@ -168,6 +173,41 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
        return get_gp_pmc(pmu, msr, MSR_IA32_PMC0);
 }
 
+bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu)
+{
+       /*
+        * As a first step, a guest can only enable the LBR feature if its
+        * CPU model is the same as the host's, because the LBR registers
+        * are passed through to the guest and they are model specific.
+        */
+       return boot_cpu_data.x86_model == guest_cpuid_model(vcpu);
+}
+
+bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu)
+{
+       struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
+
+       return lbr->nr && (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_LBR_FMT);
+}
+
+static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index)
+{
+       struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu);
+       bool ret = false;
+
+       if (!intel_pmu_lbr_is_enabled(vcpu))
+               return ret;
+
+       ret = (index == MSR_LBR_SELECT) || (index == MSR_LBR_TOS) ||
+               (index >= records->from && index < records->from + records->nr) ||
+               (index >= records->to && index < records->to + records->nr);
+
+       if (!ret && records->info)
+               ret = (index >= records->info && index < records->info + records->nr);
+
+       return ret;
+}
+
 static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
 {
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -183,7 +223,8 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
        default:
                ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
                        get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
-                       get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr);
+                       get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) ||
+                       intel_pmu_is_valid_lbr_msr(vcpu, msr);
                break;
        }
 
@@ -202,6 +243,111 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
        return pmc;
 }
 
+static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu)
+{
+       struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+       if (lbr_desc->event) {
+               perf_event_release_kernel(lbr_desc->event);
+               lbr_desc->event = NULL;
+               vcpu_to_pmu(vcpu)->event_count--;
+       }
+}
+
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu)
+{
+       struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+       struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+       struct perf_event *event;
+
+       /*
+        * The perf_event_attr is constructed in the minimal efficient way:
+        * - set 'pinned = true' to make it task pinned so that if another
+        *   cpu-pinned event reclaims the LBR, event->oncpu will be set to -1;
+        * - set '.exclude_host = true' to record guest branch behavior;
+        *
+        * - set '.config = INTEL_FIXED_VLBR_EVENT' to tell host perf to
+        *   schedule the event with a fake counter rather than a real HW one;
+        *   see is_guest_lbr_event() and __intel_get_event_constraints();
+        *
+        * - set 'sample_type = PERF_SAMPLE_BRANCH_STACK' and
+        *   'branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+        *   PERF_SAMPLE_BRANCH_USER' to configure it as an LBR callstack
+        *   event, which helps KVM save/restore guest LBR records across
+        *   host context switches and cuts a lot of overhead; see
+        *   branch_user_callstack() and intel_pmu_lbr_sched_task();
+        */
+       struct perf_event_attr attr = {
+               .type = PERF_TYPE_RAW,
+               .size = sizeof(attr),
+               .config = INTEL_FIXED_VLBR_EVENT,
+               .sample_type = PERF_SAMPLE_BRANCH_STACK,
+               .pinned = true,
+               .exclude_host = true,
+               .branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+                                       PERF_SAMPLE_BRANCH_USER,
+       };
+
+       if (unlikely(lbr_desc->event)) {
+               __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+               return 0;
+       }
+
+       event = perf_event_create_kernel_counter(&attr, -1,
+                                               current, NULL, NULL);
+       if (IS_ERR(event)) {
+               pr_debug_ratelimited("%s: failed %ld\n",
+                                       __func__, PTR_ERR(event));
+               return -ENOENT;
+       }
+       lbr_desc->event = event;
+       pmu->event_count++;
+       __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+       return 0;
+}
+
+/*
+ * It's safe to access the LBR MSRs from the guest when they have not
+ * been passed through, since the host will restore or reset the LBR
+ * MSR records when the guest LBR event is scheduled in.
+ */
+static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
+                                    struct msr_data *msr_info, bool read)
+{
+       struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+       u32 index = msr_info->index;
+
+       if (!intel_pmu_is_valid_lbr_msr(vcpu, index))
+               return false;
+
+       if (!lbr_desc->event && !intel_pmu_create_guest_lbr_event(vcpu))
+               goto dummy;
+
+       /*
+        * Disable irqs to ensure the LBR feature doesn't get reclaimed by the
+        * host while the value is read from the MSR, which prevents the host
+        * LBR value from being leaked to the guest. If the LBR has been
+        * reclaimed, return 0 on guest reads.
+        */
+       local_irq_disable();
+       if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) {
+               if (read)
+                       rdmsrl(index, msr_info->data);
+               else
+                       wrmsrl(index, msr_info->data);
+               __set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
+               local_irq_enable();
+               return true;
+       }
+       clear_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
+       local_irq_enable();
+
+dummy:
+       if (read)
+               msr_info->data = 0;
+       return true;
+}
+
 static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -236,7 +382,8 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
                        msr_info->data = pmc->eventsel;
                        return 0;
-               }
+               } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true))
+                       return 0;
        }
 
        return 1;
@@ -307,7 +454,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                                reprogram_gp_counter(pmc, data);
                                return 0;
                        }
-               }
+               } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false))
+                       return 0;
        }
 
        return 1;
@@ -316,6 +464,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 {
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+       struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
        struct x86_pmu_capability x86_pmu;
        struct kvm_cpuid_entry2 *entry;
        union cpuid10_eax eax;
@@ -327,7 +477,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
        pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
        pmu->version = 0;
        pmu->reserved_bits = 0xffffffff00200000ull;
-       vcpu->arch.perf_capabilities = 0;
 
        entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
        if (!entry)
@@ -340,12 +489,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
                return;
 
        perf_get_x86_pmu_capability(&x86_pmu);
-       if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
-               vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
 
        pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
                                         x86_pmu.num_counters_gp);
+       eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp);
        pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
+       eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len);
        pmu->available_event_types = ~entry->ebx &
                                        ((1ull << eax.split.mask_length) - 1);
 
@@ -355,6 +504,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
                pmu->nr_arch_fixed_counters =
                        min_t(int, edx.split.num_counters_fixed,
                              x86_pmu.num_counters_fixed);
+               edx.split.bit_width_fixed = min_t(int,
+                       edx.split.bit_width_fixed, x86_pmu.bit_width_fixed);
                pmu->counter_bitmask[KVM_PMC_FIXED] =
                        ((u64)1 << edx.split.bit_width_fixed) - 1;
        }
@@ -381,12 +532,21 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
                INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
 
        nested_vmx_pmu_entry_exit_ctls_update(vcpu);
+
+       if (intel_pmu_lbr_is_compatible(vcpu))
+               x86_perf_get_lbr(&lbr_desc->records);
+       else
+               lbr_desc->records.nr = 0;
+
+       if (lbr_desc->records.nr)
+               bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1);
 }
 
 static void intel_pmu_init(struct kvm_vcpu *vcpu)
 {
        int i;
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+       struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
 
        for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
                pmu->gp_counters[i].type = KVM_PMC_GP;
@@ -401,6 +561,11 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
                pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
                pmu->fixed_counters[i].current_config = 0;
        }
+
+       vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
+       lbr_desc->records.nr = 0;
+       lbr_desc->event = NULL;
+       lbr_desc->msr_passthrough = false;
 }
 
 static void intel_pmu_reset(struct kvm_vcpu *vcpu)
@@ -425,6 +590,119 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
 
        pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
                pmu->global_ovf_ctrl = 0;
+
+       intel_pmu_release_guest_lbr_event(vcpu);
+}
+
+/*
+ * Emulate LBR_On_PMI behavior for 1 < pmu.version < 4.
+ *
+ * If Freeze_LBR_On_PMI = 1, the LBR is frozen on PMI and
+ * KVM emulates this by clearing the LBR bit (bit 0) in IA32_DEBUGCTL.
+ *
+ * The guest needs to re-enable LBR to resume recording branches.
+ */
+static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
+{
+       u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+       if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
+               data &= ~DEBUGCTLMSR_LBR;
+               vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+       }
+}
+
+static void intel_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
+{
+       u8 version = vcpu_to_pmu(vcpu)->version;
+
+       if (!intel_pmu_lbr_is_enabled(vcpu))
+               return;
+
+       if (version > 1 && version < 4)
+               intel_pmu_legacy_freezing_lbrs_on_pmi(vcpu);
+}
+
+static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set)
+{
+       struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
+       int i;
+
+       for (i = 0; i < lbr->nr; i++) {
+               vmx_set_intercept_for_msr(vcpu, lbr->from + i, MSR_TYPE_RW, set);
+               vmx_set_intercept_for_msr(vcpu, lbr->to + i, MSR_TYPE_RW, set);
+               if (lbr->info)
+                       vmx_set_intercept_for_msr(vcpu, lbr->info + i, MSR_TYPE_RW, set);
+       }
+
+       vmx_set_intercept_for_msr(vcpu, MSR_LBR_SELECT, MSR_TYPE_RW, set);
+       vmx_set_intercept_for_msr(vcpu, MSR_LBR_TOS, MSR_TYPE_RW, set);
+}
+
+static inline void vmx_disable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu)
+{
+       struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+       if (!lbr_desc->msr_passthrough)
+               return;
+
+       vmx_update_intercept_for_lbr_msrs(vcpu, true);
+       lbr_desc->msr_passthrough = false;
+}
+
+static inline void vmx_enable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu)
+{
+       struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+       if (lbr_desc->msr_passthrough)
+               return;
+
+       vmx_update_intercept_for_lbr_msrs(vcpu, false);
+       lbr_desc->msr_passthrough = true;
+}
+
+/*
+ * Higher priority host perf events (e.g. cpu-pinned ones) could reclaim the
+ * PMU resources (e.g. LBR) that were assigned to the guest. This is
+ * usually done via IPI calls (more details in perf_install_in_context).
+ *
+ * Before entering non-root mode (with irqs disabled here), double-check
+ * that the PMU features exposed to the guest have not been reclaimed
+ * by higher priority host events. Otherwise, disallow the vCPU's access
+ * to the reclaimed features.
+ */
+void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+{
+       struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+       struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+       if (!lbr_desc->event) {
+               vmx_disable_lbr_msrs_passthrough(vcpu);
+               if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
+                       goto warn;
+               if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
+                       goto warn;
+               return;
+       }
+
+       if (lbr_desc->event->state < PERF_EVENT_STATE_ACTIVE) {
+               vmx_disable_lbr_msrs_passthrough(vcpu);
+               __clear_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+               goto warn;
+       } else
+               vmx_enable_lbr_msrs_passthrough(vcpu);
+
+       return;
+
+warn:
+       pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n",
+               vcpu->vcpu_id);
+}
+
+static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
+{
+       if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
+               intel_pmu_release_guest_lbr_event(vcpu);
 }
 
 struct kvm_pmu_ops intel_pmu_ops = {
@@ -441,4 +719,6 @@ struct kvm_pmu_ops intel_pmu_ops = {
        .refresh = intel_pmu_refresh,
        .init = intel_pmu_init,
        .reset = intel_pmu_reset,
+       .deliver_pmi = intel_pmu_deliver_pmi,
+       .cleanup = intel_pmu_cleanup,
 };
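A hedged sketch of the generic side that consumes the two new callbacks (arch/x86/kvm/pmu.c, not shown in this section); both hooks are optional, and the wrappers below are approximations with illustrative names, not the exact in-tree code:

static void example_deliver_pmi(struct kvm_vcpu *vcpu)
{
        if (kvm_x86_ops.pmu_ops->deliver_pmi)
                kvm_x86_ops.pmu_ops->deliver_pmi(vcpu);
        /* ...the PMI is then injected through the local APIC's LVTPC entry. */
}

static void example_pmu_cleanup(struct kvm_vcpu *vcpu)
{
        if (kvm_x86_ops.pmu_ops->cleanup)
                kvm_x86_ops.pmu_ops->cleanup(vcpu);
        /* For Intel this drops the vLBR event once DEBUGCTL.LBR is clear. */
}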
index f02962d..4831bc4 100644 (file)
@@ -54,7 +54,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 
                dest = cpu_physical_id(cpu);
 
-               if (x2apic_enabled())
+               if (x2apic_mode)
                        new.ndst = dest;
                else
                        new.ndst = (dest << 8) & 0xFF00;
@@ -104,7 +104,7 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
 
                dest = cpu_physical_id(vcpu->cpu);
 
-               if (x2apic_enabled())
+               if (x2apic_mode)
                        new.ndst = dest;
                else
                        new.ndst = (dest << 8) & 0xFF00;
@@ -174,7 +174,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
                 */
                dest = cpu_physical_id(vcpu->pre_pcpu);
 
-               if (x2apic_enabled())
+               if (x2apic_mode)
                        new.ndst = dest;
                else
                        new.ndst = (dest << 8) & 0xFF00;
index e85aa5f..3a64616 100644 (file)
@@ -44,7 +44,7 @@
  * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump
  * to vmx_vmexit.
  */
-SYM_FUNC_START(vmx_vmenter)
+SYM_FUNC_START_LOCAL(vmx_vmenter)
        /* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */
        je 2f
 
index 2af05d3..e0a3a9b 100644 (file)
@@ -50,6 +50,7 @@
 #include "capabilities.h"
 #include "cpuid.h"
 #include "evmcs.h"
+#include "hyperv.h"
 #include "irq.h"
 #include "kvm_cache_regs.h"
 #include "lapic.h"
@@ -552,7 +553,7 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
 {
        struct hv_enlightened_vmcs *evmcs;
        struct hv_partition_assist_pg **p_hv_pa_pg =
-                       &vcpu->kvm->arch.hyperv.hv_pa_pg;
+                       &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
        /*
         * Synthetic VM-Exit is not enabled in current code and so all
         * evmcs in a single VM share the same assist page.
@@ -658,6 +659,14 @@ static bool is_valid_passthrough_msr(u32 msr)
        case MSR_IA32_RTIT_CR3_MATCH:
        case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
                /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
+       case MSR_LBR_SELECT:
+       case MSR_LBR_TOS:
+       case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
+       case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
+       case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
+       case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
+       case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
+               /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
                return true;
        }
 
@@ -806,7 +815,7 @@ static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
        return *p;
 }
 
-void update_exception_bitmap(struct kvm_vcpu *vcpu)
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
 {
        u32 eb;
 
@@ -1102,7 +1111,7 @@ static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
 static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
 {
        /* The base must be 128-byte aligned and a legal physical address. */
-       return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f);
+       return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
 }
 
 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
@@ -1577,7 +1586,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
         * i.e. we end up advancing IP with some random value.
         */
        if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
-           to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
+           to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
                orig_rip = kvm_rip_read(vcpu);
                rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 #ifdef CONFIG_X86_64
@@ -1924,6 +1933,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
                        return 1;
                goto find_uret_msr;
+       case MSR_IA32_DEBUGCTLMSR:
+               msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+               break;
        default:
        find_uret_msr:
                msr = vmx_find_uret_msr(vmx, msr_info->index);
@@ -1947,6 +1959,16 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
        return (unsigned long)data;
 }
 
+static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
+{
+       u64 debugctl = vmx_supported_debugctl();
+
+       if (!intel_pmu_lbr_is_enabled(vcpu))
+               debugctl &= ~DEBUGCTLMSR_LBR_MASK;
+
+       return debugctl;
+}
+
 /*
  * Writes msr value into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
@@ -1997,14 +2019,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                }
                vmcs_writel(GUEST_SYSENTER_ESP, data);
                break;
-       case MSR_IA32_DEBUGCTLMSR:
+       case MSR_IA32_DEBUGCTLMSR: {
+               u64 invalid = data & ~vcpu_supported_debugctl(vcpu);
+               if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
+                       if (report_ignored_msrs)
+                               vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
+                                           __func__, data);
+                       data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+                       invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+               }
+
+               if (invalid)
+                       return 1;
+
                if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
                                                VM_EXIT_SAVE_DEBUG_CONTROLS)
                        get_vmcs12(vcpu)->guest_ia32_debugctl = data;
 
-               ret = kvm_set_msr_common(vcpu, msr_info);
-               break;
-
+               vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+               if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
+                   (data & DEBUGCTLMSR_LBR))
+                       intel_pmu_create_guest_lbr_event(vcpu);
+               return 0;
+       }
        case MSR_IA32_BNDCFGS:
                if (!kvm_mpx_supported() ||
                    (!msr_info->host_initiated &&
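A hedged guest-side view of what now triggers the LBR machinery (illustrative, not part of the patch): the guest simply sets DEBUGCTL.LBR, KVM intercepts the write handled above, creates the guest LBR perf event via intel_pmu_create_guest_lbr_event(), and passes the LBR MSRs through on the next entry if perf grants them.

static inline void guest_enable_lbr(void)
{
        u64 debugctl;

        rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
        wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | DEBUGCTLMSR_LBR);
}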
@@ -2196,6 +2233,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                if ((data >> 32) != 0)
                        return 1;
                goto find_uret_msr;
+       case MSR_IA32_PERF_CAPABILITIES:
+               if (data && !vcpu_to_pmu(vcpu)->version)
+                       return 1;
+               if (data & PMU_CAP_LBR_FMT) {
+                       if ((data & PMU_CAP_LBR_FMT) !=
+                           (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
+                               return 1;
+                       if (!intel_pmu_lbr_is_compatible(vcpu))
+                               return 1;
+               }
+               ret = kvm_set_msr_common(vcpu, msr_info);
+               break;
 
        default:
        find_uret_msr:
@@ -2265,7 +2314,6 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer)
        u64 msr;
 
        cr4_set_bits(X86_CR4_VMXE);
-       intel_pt_handle_vmx(1);
 
        asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
                          _ASM_EXTABLE(1b, %l[fault])
@@ -2276,7 +2324,6 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer)
 fault:
        WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
                  rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
-       intel_pt_handle_vmx(0);
        cr4_clear_bits(X86_CR4_VMXE);
 
        return -EFAULT;
@@ -2299,9 +2346,13 @@ static int hardware_enable(void)
            !hv_get_vp_assist_page(cpu))
                return -EFAULT;
 
+       intel_pt_handle_vmx(1);
+
        r = kvm_cpu_vmxon(phys_addr);
-       if (r)
+       if (r) {
+               intel_pt_handle_vmx(0);
                return r;
+       }
 
        if (enable_ept)
                ept_sync_global();
@@ -2319,22 +2370,14 @@ static void vmclear_local_loaded_vmcss(void)
                __loaded_vmcs_clear(v);
 }
 
-
-/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
- * tricks.
- */
-static void kvm_cpu_vmxoff(void)
-{
-       asm volatile (__ex("vmxoff"));
-
-       intel_pt_handle_vmx(0);
-       cr4_clear_bits(X86_CR4_VMXE);
-}
-
 static void hardware_disable(void)
 {
        vmclear_local_loaded_vmcss();
-       kvm_cpu_vmxoff();
+
+       if (cpu_vmxoff())
+               kvm_spurious_fault();
+
+       intel_pt_handle_vmx(0);
 }
 
 /*
@@ -2428,7 +2471,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
                        SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
                        SECONDARY_EXEC_PT_USE_GPA |
                        SECONDARY_EXEC_PT_CONCEAL_VMX |
-                       SECONDARY_EXEC_ENABLE_VMFUNC;
+                       SECONDARY_EXEC_ENABLE_VMFUNC |
+                       SECONDARY_EXEC_BUS_LOCK_DETECTION;
                if (cpu_has_sgx())
                        opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
                if (adjust_vmx_controls(min2, opt2,
@@ -2739,7 +2783,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
        vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
                        (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
 
-       update_exception_bitmap(vcpu);
+       vmx_update_exception_bitmap(vcpu);
 
        fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
        fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
@@ -2819,7 +2863,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 
        vmcs_writel(GUEST_RFLAGS, flags);
        vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
-       update_exception_bitmap(vcpu);
+       vmx_update_exception_bitmap(vcpu);
 
        fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
        fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
@@ -3774,7 +3818,7 @@ static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
                vmx_set_msr_bitmap_write(msr_bitmap, msr);
 }
 
-static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
                                                      u32 msr, int type, bool value)
 {
        if (value)
@@ -4269,6 +4313,9 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
        vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
                                    ENABLE_USR_WAIT_PAUSE, false);
 
+       if (!vcpu->kvm->arch.bus_lock_detection_enabled)
+               exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
+
        vmx->secondary_exec_control = exec_control;
 }
 
@@ -4467,23 +4514,23 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        vmx_set_cr4(vcpu, 0);
        vmx_set_efer(vcpu, 0);
 
-       update_exception_bitmap(vcpu);
+       vmx_update_exception_bitmap(vcpu);
 
        vpid_sync_context(vmx->vpid);
        if (init_event)
                vmx_clear_hlt(vcpu);
 }
 
-static void enable_irq_window(struct kvm_vcpu *vcpu)
+static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
 {
        exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
 }
 
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
+static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
 {
        if (!enable_vnmi ||
            vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
-               enable_irq_window(vcpu);
+               vmx_enable_irq_window(vcpu);
                return;
        }
 
@@ -4824,7 +4871,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
                        kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
                        return 1;
                }
-               kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
+               kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
                kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
                fallthrough;
        case BP_VECTOR:
@@ -5049,6 +5096,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification;
        int dr, dr7, reg;
+       int err = 1;
 
        exit_qualification = vmx_get_exit_qual(vcpu);
        dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
@@ -5057,9 +5105,9 @@ static int handle_dr(struct kvm_vcpu *vcpu)
        if (!kvm_require_dr(vcpu, dr))
                return 1;
 
-       /* Do not handle if the CPL > 0, will trigger GP on re-entry */
-       if (!kvm_require_cpl(vcpu, 0))
-               return 1;
+       if (kvm_x86_ops.get_cpl(vcpu) > 0)
+               goto out;
+
        dr7 = vmcs_readl(GUEST_DR7);
        if (dr7 & DR7_GD) {
                /*
@@ -5068,7 +5116,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
                 * guest debugging itself.
                 */
                if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
-                       vcpu->run->debug.arch.dr6 = DR6_BD | DR6_RTM | DR6_FIXED_1;
+                       vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
                        vcpu->run->debug.arch.dr7 = dr7;
                        vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
                        vcpu->run->debug.arch.exception = DB_VECTOR;
@@ -5096,14 +5144,15 @@ static int handle_dr(struct kvm_vcpu *vcpu)
        if (exit_qualification & TYPE_MOV_FROM_DR) {
                unsigned long val;
 
-               if (kvm_get_dr(vcpu, dr, &val))
-                       return 1;
+               kvm_get_dr(vcpu, dr, &val);
                kvm_register_write(vcpu, reg, val);
-       } else
-               if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
-                       return 1;
+               err = 0;
+       } else {
+               err = kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg));
+       }
 
-       return kvm_skip_emulated_instruction(vcpu);
+out:
+       return kvm_complete_insn_gp(vcpu, err);
 }
 
 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
@@ -5177,9 +5226,8 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
        u64 new_bv = kvm_read_edx_eax(vcpu);
        u32 index = kvm_rcx_read(vcpu);
 
-       if (kvm_set_xcr(vcpu, index, new_bv) == 0)
-               return kvm_skip_emulated_instruction(vcpu);
-       return 1;
+       int err = kvm_set_xcr(vcpu, index, new_bv);
+       return kvm_complete_insn_gp(vcpu, err);
 }
 
 static int handle_apic_access(struct kvm_vcpu *vcpu)
@@ -5600,6 +5648,13 @@ static int handle_encls(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
+{
+       vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+       vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+       return 0;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -5656,6 +5711,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_VMFUNC]                  = handle_vmx_instruction,
        [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
        [EXIT_REASON_ENCLS]                   = handle_encls,
+       [EXIT_REASON_BUS_LOCK]                = handle_bus_lock_vmexit,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -5667,7 +5723,7 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
        *info1 = vmx_get_exit_qual(vcpu);
-       if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+       if (!(vmx->exit_reason.failed_vmentry)) {
                *info2 = vmx->idt_vectoring_info;
                *intr_info = vmx_get_intr_info(vcpu);
                if (is_exception_with_error_code(*intr_info))
@@ -5908,11 +5964,12 @@ void dump_vmcs(void)
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
  */
-static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u32 exit_reason = vmx->exit_reason;
+       union vmx_exit_reason exit_reason = vmx->exit_reason;
        u32 vectoring_info = vmx->idt_vectoring_info;
+       u16 exit_handler_index;
 
        /*
         * Flush logged GPAs PML buffer, this will make dirty_bitmap more
@@ -5954,11 +6011,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
                        return 1;
        }
 
-       if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+       if (exit_reason.failed_vmentry) {
                dump_vmcs();
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
-                       = exit_reason;
+                       = exit_reason.full;
                vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
                return 0;
        }
@@ -5980,18 +6037,18 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
         * will cause infinite loop.
         */
        if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
-                       (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
-                       exit_reason != EXIT_REASON_EPT_VIOLATION &&
-                       exit_reason != EXIT_REASON_PML_FULL &&
-                       exit_reason != EXIT_REASON_APIC_ACCESS &&
-                       exit_reason != EXIT_REASON_TASK_SWITCH)) {
+           (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
+            exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
+            exit_reason.basic != EXIT_REASON_PML_FULL &&
+            exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
+            exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
                vcpu->run->internal.ndata = 3;
                vcpu->run->internal.data[0] = vectoring_info;
-               vcpu->run->internal.data[1] = exit_reason;
+               vcpu->run->internal.data[1] = exit_reason.full;
                vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
-               if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
+               if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
                        vcpu->run->internal.ndata++;
                        vcpu->run->internal.data[3] =
                                vmcs_read64(GUEST_PHYSICAL_ADDRESS);
@@ -6023,42 +6080,62 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
        if (exit_fastpath != EXIT_FASTPATH_NONE)
                return 1;
 
-       if (exit_reason >= kvm_vmx_max_exit_handlers)
+       if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
                goto unexpected_vmexit;
 #ifdef CONFIG_RETPOLINE
-       if (exit_reason == EXIT_REASON_MSR_WRITE)
+       if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
                return kvm_emulate_wrmsr(vcpu);
-       else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
+       else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
                return handle_preemption_timer(vcpu);
-       else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW)
+       else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
                return handle_interrupt_window(vcpu);
-       else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+       else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
                return handle_external_interrupt(vcpu);
-       else if (exit_reason == EXIT_REASON_HLT)
+       else if (exit_reason.basic == EXIT_REASON_HLT)
                return kvm_emulate_halt(vcpu);
-       else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
+       else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
                return handle_ept_misconfig(vcpu);
 #endif
 
-       exit_reason = array_index_nospec(exit_reason,
-                                        kvm_vmx_max_exit_handlers);
-       if (!kvm_vmx_exit_handlers[exit_reason])
+       exit_handler_index = array_index_nospec((u16)exit_reason.basic,
+                                               kvm_vmx_max_exit_handlers);
+       if (!kvm_vmx_exit_handlers[exit_handler_index])
                goto unexpected_vmexit;
 
-       return kvm_vmx_exit_handlers[exit_reason](vcpu);
+       return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
 
 unexpected_vmexit:
-       vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+       vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
+                   exit_reason.full);
        dump_vmcs();
        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
        vcpu->run->internal.suberror =
                        KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
        vcpu->run->internal.ndata = 2;
-       vcpu->run->internal.data[0] = exit_reason;
+       vcpu->run->internal.data[0] = exit_reason.full;
        vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
        return 0;
 }
 
+static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+{
+       int ret = __vmx_handle_exit(vcpu, exit_fastpath);
+
+       /*
+        * Even when the current exit reason is handled by KVM internally, we
+        * still need to exit to user space when a bus lock is detected, to
+        * inform userspace that a bus lock occurred in the guest.
+        */
+       if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
+               if (ret > 0)
+                       vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+
+               vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+               return 0;
+       }
+       return ret;
+}
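
As a rough sketch of the userspace side (not from this patch; run_once() and vcpu_fd are hypothetical), a VMM run loop could consume the new notification like this, assuming KVM_CAP_X86_BUS_LOCK_EXIT was enabled with KVM_BUS_LOCK_DETECTION_EXIT:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Hypothetical helper: one KVM_RUN iteration for an already configured
 * vcpu whose struct kvm_run is mmap'ed at 'run'. */
static int run_once(int vcpu_fd, struct kvm_run *run)
{
	if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
		return -1;

	/* KVM_RUN_X86_BUS_LOCK can be set even when the exit itself was
	 * handled in the kernel and exit_reason reports something else. */
	if (run->flags & KVM_RUN_X86_BUS_LOCK)
		fprintf(stderr, "guest executed a bus-locking instruction\n");

	if (run->exit_reason == KVM_EXIT_X86_BUS_LOCK)
		return 0;	/* throttle or log, then re-enter the guest */

	return 1;		/* let the caller dispatch other exit reasons */
}

Reporting the event via run->flags is what lets the wrapper above notify userspace even for exits that KVM handled itself.
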
+
 /*
  * Software based L1D cache flush which is used when microcode providing
  * the cache control MSR is not loaded.
@@ -6129,7 +6206,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
                : "eax", "ebx", "ecx", "edx");
 }
 
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        int tpr_threshold;
@@ -6373,9 +6450,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-       if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+       if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
                handle_external_interrupt_irqoff(vcpu);
-       else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
+       else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
                handle_exception_nmi_irqoff(vmx);
 }
 
@@ -6567,7 +6644,7 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
 
 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
 {
-       switch (to_vmx(vcpu)->exit_reason) {
+       switch (to_vmx(vcpu)->exit_reason.basic) {
        case EXIT_REASON_MSR_WRITE:
                return handle_fastpath_set_msr_irqoff(vcpu);
        case EXIT_REASON_PREEMPTION_TIMER:
@@ -6577,8 +6654,6 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
        }
 }
 
-bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
-
 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
                                        struct vcpu_vmx *vmx)
 {
@@ -6638,11 +6713,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
 
 static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
-       fastpath_t exit_fastpath;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long cr3, cr4;
 
-reenter_guest:
        /* Record the guest's net vcpu time for enforced NMI injections. */
        if (unlikely(!enable_vnmi &&
                     vmx->loaded_vmcs->soft_vnmi_blocked))
@@ -6653,6 +6726,8 @@ reenter_guest:
        if (vmx->emulation_required)
                return EXIT_FASTPATH_NONE;
 
+       trace_kvm_entry(vcpu);
+
        if (vmx->ple_window_dirty) {
                vmx->ple_window_dirty = false;
                vmcs_write32(PLE_WINDOW, vmx->ple_window);
@@ -6694,6 +6769,8 @@ reenter_guest:
        pt_guest_enter(vmx);
 
        atomic_switch_perf_msrs(vmx);
+       if (intel_pmu_lbr_is_enabled(vcpu))
+               vmx_passthrough_lbr_msrs(vcpu);
 
        if (enable_preemption_timer)
                vmx_update_hv_timer(vcpu);
@@ -6732,12 +6809,12 @@ reenter_guest:
        x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
 
        /* All fields are clean at this point */
-       if (static_branch_unlikely(&enable_evmcs))
+       if (static_branch_unlikely(&enable_evmcs)) {
                current_evmcs->hv_clean_fields |=
                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
 
-       if (static_branch_unlikely(&enable_evmcs))
-               current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index;
+               current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
+       }
 
        /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
        if (vmx->host_debugctlmsr)
@@ -6766,17 +6843,17 @@ reenter_guest:
        vmx->idt_vectoring_info = 0;
 
        if (unlikely(vmx->fail)) {
-               vmx->exit_reason = 0xdead;
+               vmx->exit_reason.full = 0xdead;
                return EXIT_FASTPATH_NONE;
        }
 
-       vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
-       if (unlikely((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY))
+       vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+       if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
                kvm_machine_check();
 
-       trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
+       trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX);
 
-       if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+       if (unlikely(vmx->exit_reason.failed_vmentry))
                return EXIT_FASTPATH_NONE;
 
        vmx->loaded_vmcs->launched = 1;
@@ -6788,22 +6865,7 @@ reenter_guest:
        if (is_guest_mode(vcpu))
                return EXIT_FASTPATH_NONE;
 
-       exit_fastpath = vmx_exit_handlers_fastpath(vcpu);
-       if (exit_fastpath == EXIT_FASTPATH_REENTER_GUEST) {
-               if (!kvm_vcpu_exit_request(vcpu)) {
-                       /*
-                        * FIXME: this goto should be a loop in vcpu_enter_guest,
-                        * but it would incur the cost of a retpoline for now.
-                        * Revisit once static calls are available.
-                        */
-                       if (vcpu->arch.apicv_active)
-                               vmx_sync_pir_to_irr(vcpu);
-                       goto reenter_guest;
-               }
-               exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
-       }
-
-       return exit_fastpath;
+       return vmx_exit_handlers_fastpath(vcpu);
 }
 
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
@@ -6858,11 +6920,20 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
                switch (index) {
                case MSR_IA32_TSX_CTRL:
                        /*
-                        * No need to pass TSX_CTRL_CPUID_CLEAR through, so
-                        * let's avoid changing CPUID bits under the host
-                        * kernel's feet.
+                        * TSX_CTRL_CPUID_CLEAR is handled in the CPUID
+                        * interception.  Keep the host value unchanged to avoid
+                        * changing CPUID bits under the host kernel's feet.
+                        *
+                        * hle=0, rtm=0, tsx_ctrl=1 can be found with some
+                        * combinations of new kernel and old userspace.  If
+                        * those guests run on a tsx=off host, do allow guests
+                        * to use TSX_CTRL, but do not change the value on the
+                        * host so that TSX always remains disabled.
                         */
-                       vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+                       if (boot_cpu_has(X86_FEATURE_RTM))
+                               vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+                       else
+                               vmx->guest_uret_msrs[j].mask = 0;
                        break;
                default:
                        vmx->guest_uret_msrs[j].mask = -1ull;
@@ -7245,7 +7316,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
        set_cr4_guest_host_mask(vmx);
 
        /* Refresh #PF interception to account for MAXPHYADDR changes. */
-       update_exception_bitmap(vcpu);
+       vmx_update_exception_bitmap(vcpu);
 }
 
 static __init void vmx_set_cpu_caps(void)
@@ -7535,7 +7606,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
        return 0;
 }
 
-static void enable_smi_window(struct kvm_vcpu *vcpu)
+static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
 {
        /* RSM will cause a vmexit anyway.  */
 }
@@ -7595,7 +7666,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .vcpu_load = vmx_vcpu_load,
        .vcpu_put = vmx_vcpu_put,
 
-       .update_exception_bitmap = update_exception_bitmap,
+       .update_exception_bitmap = vmx_update_exception_bitmap,
        .get_msr_feature = vmx_get_msr_feature,
        .get_msr = vmx_get_msr,
        .set_msr = vmx_set_msr,
@@ -7638,9 +7709,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .nmi_allowed = vmx_nmi_allowed,
        .get_nmi_mask = vmx_get_nmi_mask,
        .set_nmi_mask = vmx_set_nmi_mask,
-       .enable_nmi_window = enable_nmi_window,
-       .enable_irq_window = enable_irq_window,
-       .update_cr8_intercept = update_cr8_intercept,
+       .enable_nmi_window = vmx_enable_nmi_window,
+       .enable_irq_window = vmx_enable_irq_window,
+       .update_cr8_intercept = vmx_update_cr8_intercept,
        .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
        .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
        .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
@@ -7698,7 +7769,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .smi_allowed = vmx_smi_allowed,
        .pre_enter_smm = vmx_pre_enter_smm,
        .pre_leave_smm = vmx_pre_leave_smm,
-       .enable_smi_window = enable_smi_window,
+       .enable_smi_window = vmx_enable_smi_window,
 
        .can_emulate_instruction = vmx_can_emulate_instruction,
        .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
@@ -7799,6 +7870,8 @@ static __init int hardware_setup(void)
                kvm_tsc_scaling_ratio_frac_bits = 48;
        }
 
+       kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
+
        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
        if (enable_ept)
index 9d3a557..12c53d0 100644 (file)
@@ -70,6 +70,54 @@ struct pt_desc {
        struct pt_ctx guest;
 };
 
+union vmx_exit_reason {
+       struct {
+               u32     basic                   : 16;
+               u32     reserved16              : 1;
+               u32     reserved17              : 1;
+               u32     reserved18              : 1;
+               u32     reserved19              : 1;
+               u32     reserved20              : 1;
+               u32     reserved21              : 1;
+               u32     reserved22              : 1;
+               u32     reserved23              : 1;
+               u32     reserved24              : 1;
+               u32     reserved25              : 1;
+               u32     bus_lock_detected       : 1;
+               u32     enclave_mode            : 1;
+               u32     smi_pending_mtf         : 1;
+               u32     smi_from_vmx_root       : 1;
+               u32     reserved30              : 1;
+               u32     failed_vmentry          : 1;
+       };
+       u32 full;
+};
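
For illustration only (not part of the patch), a raw VM_EXIT_REASON value of 0x80000021 decodes through this union as:

	union vmx_exit_reason reason = { .full = 0x80000021 };

	/*
	 * reason.basic             == 0x21  (EXIT_REASON_INVALID_STATE)
	 * reason.failed_vmentry    == 1     (bit 31)
	 * reason.bus_lock_detected == 0
	 */
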
+
+#define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc)
+#define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records)
+
+bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu);
+bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu);
+
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);
+void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu);
+
+struct lbr_desc {
+       /* Basic info about guest LBR records. */
+       struct x86_pmu_lbr records;
+
+       /*
+        * Emulate the LBR feature via passthrough LBR registers when the
+        * per-vcpu guest LBR event is scheduled on the current pcpu.
+        *
+        * The records may be inaccurate if the host reclaims the LBR.
+        */
+       struct perf_event *event;
+
+       /* True if LBRs are marked as not intercepted in the MSR bitmap */
+       bool msr_passthrough;
+};
+
 /*
  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -244,7 +292,7 @@ struct vcpu_vmx {
        int vpid;
        bool emulation_required;
 
-       u32 exit_reason;
+       union vmx_exit_reason exit_reason;
 
        /* Posted interrupt descriptor */
        struct pi_desc pi_desc;
@@ -279,6 +327,7 @@ struct vcpu_vmx {
        u64 ept_pointer;
 
        struct pt_desc pt_desc;
+       struct lbr_desc lbr_desc;
 
        /* Save desired MSR intercept (read: pass-through) state */
 #define MAX_POSSIBLE_PASSTHROUGH_MSRS  13
@@ -329,7 +378,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
                   int root_level);
 
-void update_exception_bitmap(struct kvm_vcpu *vcpu);
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
 void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
@@ -339,8 +388,11 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr);
 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu);
 void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
+bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
+       u32 msr, int type, bool value);
 
 static inline u8 vmx_get_rvi(void)
 {
index 9a8969a..3fa1403 100644 (file)
@@ -29,6 +29,7 @@
 #include "pmu.h"
 #include "hyperv.h"
 #include "lapic.h"
+#include "xen.h"
 
 #include <linux/clocksource.h>
 #include <linux/interrupt.h>
@@ -105,6 +106,7 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 static void process_nmi(struct kvm_vcpu *vcpu);
+static void process_smi(struct kvm_vcpu *vcpu);
 static void enter_smm(struct kvm_vcpu *vcpu);
 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
 static void store_regs(struct kvm_vcpu *vcpu);
@@ -113,11 +115,21 @@ static int sync_regs(struct kvm_vcpu *vcpu);
 struct kvm_x86_ops kvm_x86_ops __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
+#define KVM_X86_OP(func)                                            \
+       DEFINE_STATIC_CALL_NULL(kvm_x86_##func,                      \
+                               *(((struct kvm_x86_ops *)0)->func));
+#define KVM_X86_OP_NULL KVM_X86_OP
+#include <asm/kvm-x86-ops.h>
+EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
+EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
+EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current);
+
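
For reference, a sketch of what one KVM_X86_OP() invocation above expands to and how call sites change; this mirrors the pattern used throughout the patch rather than literal preprocessor output:

/* KVM_X86_OP(get_cpl) expands (roughly) to: */
DEFINE_STATIC_CALL_NULL(kvm_x86_get_cpl,
			*(((struct kvm_x86_ops *)0)->get_cpl));

/* and call sites elsewhere in this patch change from */
cpl = kvm_x86_ops.get_cpl(vcpu);
/* to */
cpl = static_call(kvm_x86_get_cpl)(vcpu);
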
 static bool __read_mostly ignore_msrs = 0;
 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
-static bool __read_mostly report_ignored_msrs = true;
+bool __read_mostly report_ignored_msrs = true;
 module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
+EXPORT_SYMBOL_GPL(report_ignored_msrs);
 
 unsigned int min_timer_period_us = 200;
 module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
@@ -135,6 +147,8 @@ u64  __read_mostly kvm_max_tsc_scaling_ratio;
 EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
 u64 __read_mostly kvm_default_tsc_scaling_ratio;
 EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
+bool __read_mostly kvm_has_bus_lock_exit;
+EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit);
 
 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 static u32 __read_mostly tsc_tolerance_ppm = 250;
@@ -233,7 +247,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
        VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
        VM_STAT("mmu_pte_write", mmu_pte_write),
-       VM_STAT("mmu_pte_updated", mmu_pte_updated),
        VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
        VM_STAT("mmu_flooded", mmu_flooded),
        VM_STAT("mmu_recycled", mmu_recycled),
@@ -394,7 +407,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
        enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
        enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
-       u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
+       u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
                (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
 
        if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
@@ -483,19 +496,24 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
                 */
                vcpu->arch.dr6 &= ~DR_TRAP_BITS;
                /*
-                * DR6.RTM is set by all #DB exceptions that don't clear it.
+                * In order to reflect the #DB exception payload in guest
+                * dr6, three components need to be considered: active low
+                * bits, FIXED_1 bits and active high bits (e.g. DR6_BD,
+                * DR6_BS and DR6_BT).
+                * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits.
+                * In the target guest dr6:
+                * FIXED_1 bits should always be set.
+                * Active low bits should be cleared if the payload bit is set.
+                * Active high bits should be set if the payload bit is set.
+                *
+                * Note, the payload is compatible with the pending debug
+                * exceptions/exit qualification under VMX, where active low
+                * bits are reported as active high, so they need to be
+                * flipped for DR6.
                 */
-               vcpu->arch.dr6 |= DR6_RTM;
+               vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
                vcpu->arch.dr6 |= payload;
-               /*
-                * Bit 16 should be set in the payload whenever the #DB
-                * exception should clear DR6.RTM. This makes the payload
-                * compatible with the pending debug exceptions under VMX.
-                * Though not currently documented in the SDM, this also
-                * makes the payload compatible with the exit qualification
-                * for #DB exceptions under VMX.
-                */
-               vcpu->arch.dr6 ^= payload & DR6_RTM;
+               vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW;
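
A worked example of the three statements above (illustration only), assuming the payload carries a single-step trap, i.e. only DR6_BS:

	/*
	 * dr6 |= DR6_ACTIVE_LOW;            FIXED_1 and active-low bits set
	 * dr6 |= DR6_BS;                    the single-step trap is reported
	 * dr6 ^= DR6_BS & DR6_ACTIVE_LOW;   no-op: BS is an active-high bit
	 *
	 * Had the payload set bit 16 instead (active low in DR6, reported
	 * active high in the payload), the final XOR would clear it in the
	 * guest-visible DR6, as intended.
	 */
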
 
                /*
                 * The #DB payload is defined as compatible with the 'pending
@@ -691,7 +709,7 @@ EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
  */
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
 {
-       if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl)
+       if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl)
                return true;
        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
        return false;
@@ -741,8 +759,7 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 
 static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
 {
-       return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
-              rsvd_bits(1, 2);
+       return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
 }
 
 /*
@@ -851,7 +868,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
                if (!is_pae(vcpu))
                        return 1;
-               kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+               static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
                if (cs_l)
                        return 1;
        }
@@ -864,7 +881,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
                return 1;
 
-       kvm_x86_ops.set_cr0(vcpu, cr0);
+       static_call(kvm_x86_set_cr0)(vcpu, cr0);
 
        kvm_post_set_cr0(vcpu, old_cr0, cr0);
 
@@ -969,12 +986,10 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 
 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 {
-       if (kvm_x86_ops.get_cpl(vcpu) != 0 ||
-           __kvm_set_xcr(vcpu, index, xcr)) {
-               kvm_inject_gp(vcpu, 0);
-               return 1;
-       }
-       return 0;
+       if (static_call(kvm_x86_get_cpl)(vcpu) == 0)
+               return __kvm_set_xcr(vcpu, index, xcr);
+
+       return 1;
 }
 EXPORT_SYMBOL_GPL(kvm_set_xcr);
 
@@ -986,7 +1001,7 @@ bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
                return false;
 
-       return kvm_x86_ops.is_valid_cr4(vcpu, cr4);
+       return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
 }
 EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
 
@@ -1030,7 +1045,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                        return 1;
        }
 
-       kvm_x86_ops.set_cr4(vcpu, cr4);
+       static_call(kvm_x86_set_cr4)(vcpu, cr4);
 
        kvm_post_set_cr4(vcpu, old_cr4, cr4);
 
@@ -1058,8 +1073,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                return 0;
        }
 
-       if (is_long_mode(vcpu) &&
-           (cr3 & vcpu->arch.cr3_lm_rsvd_bits))
+       if (is_long_mode(vcpu) && kvm_vcpu_is_illegal_gpa(vcpu, cr3))
                return 1;
        else if (is_pae_paging(vcpu) &&
                 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
@@ -1113,7 +1127,7 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu)
                dr7 = vcpu->arch.guest_debug_dr7;
        else
                dr7 = vcpu->arch.dr7;
-       kvm_x86_ops.set_dr7(vcpu, dr7);
+       static_call(kvm_x86_set_dr7)(vcpu, dr7);
        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
        if (dr7 & DR7_BP_EN_MASK)
                vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
@@ -1129,7 +1143,7 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
        return fixed;
 }
 
-static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 {
        size_t size = ARRAY_SIZE(vcpu->arch.db);
 
@@ -1142,13 +1156,13 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
        case 4:
        case 6:
                if (!kvm_dr6_valid(val))
-                       return -1; /* #GP */
+                       return 1; /* #GP */
                vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
                break;
        case 5:
        default: /* 7 */
                if (!kvm_dr7_valid(val))
-                       return -1; /* #GP */
+                       return 1; /* #GP */
                vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
                kvm_update_dr7(vcpu);
                break;
@@ -1156,18 +1170,9 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 
        return 0;
 }
-
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
-{
-       if (__kvm_set_dr(vcpu, dr, val)) {
-               kvm_inject_gp(vcpu, 0);
-               return 1;
-       }
-       return 0;
-}
 EXPORT_SYMBOL_GPL(kvm_set_dr);
 
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 {
        size_t size = ARRAY_SIZE(vcpu->arch.db);
 
@@ -1184,7 +1189,6 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
                *val = vcpu->arch.dr7;
                break;
        }
-       return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_get_dr);
 
@@ -1393,16 +1397,24 @@ static u64 kvm_get_arch_capabilities(void)
        if (!boot_cpu_has_bug(X86_BUG_MDS))
                data |= ARCH_CAP_MDS_NO;
 
-       /*
-        * On TAA affected systems:
-        *      - nothing to do if TSX is disabled on the host.
-        *      - we emulate TSX_CTRL if present on the host.
-        *        This lets the guest use VERW to clear CPU buffers.
-        */
-       if (!boot_cpu_has(X86_FEATURE_RTM))
-               data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR);
-       else if (!boot_cpu_has_bug(X86_BUG_TAA))
+       if (!boot_cpu_has(X86_FEATURE_RTM)) {
+               /*
+                * If RTM=0 because the kernel has disabled TSX, the host might
+                * have TAA_NO or TSX_CTRL.  Clear TAA_NO (the guest sees RTM=0
+                * and therefore knows that there cannot be TAA) but keep
+                * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
+                * and we want to allow migrating those guests to tsx=off hosts.
+                */
+               data &= ~ARCH_CAP_TAA_NO;
+       } else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
                data |= ARCH_CAP_TAA_NO;
+       } else {
+               /*
+                * Nothing to do here; we emulate TSX_CTRL if present on the
+                * host so the guest can choose between disabling TSX or
+                * using VERW to clear CPU buffers.
+                */
+       }
 
        return data;
 }
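
The three branches above can be summarised as follows (illustration only):

	/*
	 *  host RTM | host TAA bug | effect on guest ARCH_CAPABILITIES
	 * ----------+--------------+------------------------------------
	 *    off    |    either    | clear TAA_NO, keep TSX_CTRL if set
	 *    on     |     no       | set TAA_NO
	 *    on     |     yes      | leave as read; TSX_CTRL is emulated
	 */
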
@@ -1417,7 +1429,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
                rdmsrl_safe(msr->index, &msr->data);
                break;
        default:
-               return kvm_x86_ops.get_msr_feature(msr);
+               return static_call(kvm_x86_get_msr_feature)(msr);
        }
        return 0;
 }
@@ -1493,7 +1505,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        efer &= ~EFER_LMA;
        efer |= vcpu->arch.efer & EFER_LMA;
 
-       r = kvm_x86_ops.set_efer(vcpu, efer);
+       r = static_call(kvm_x86_set_efer)(vcpu, efer);
        if (r) {
                WARN_ON(r > 0);
                return r;
@@ -1590,7 +1602,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
        msr.index = index;
        msr.host_initiated = host_initiated;
 
-       return kvm_x86_ops.set_msr(vcpu, &msr);
+       return static_call(kvm_x86_set_msr)(vcpu, &msr);
 }
 
 static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
@@ -1623,7 +1635,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
        msr.index = index;
        msr.host_initiated = host_initiated;
 
-       ret = kvm_x86_ops.get_msr(vcpu, &msr);
+       ret = static_call(kvm_x86_get_msr)(vcpu, &msr);
        if (!ret)
                *data = msr.data;
        return ret;
@@ -1664,12 +1676,12 @@ static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
                kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
        }
 
-       return kvm_x86_ops.complete_emulated_msr(vcpu, err);
+       return static_call(kvm_x86_complete_emulated_msr)(vcpu, err);
 }
 
 static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
 {
-       return kvm_x86_ops.complete_emulated_msr(vcpu, vcpu->run->msr.error);
+       return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
 }
 
 static u64 kvm_msr_reason(int r)
@@ -1741,7 +1753,7 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
                trace_kvm_msr_read_ex(ecx);
        }
 
-       return kvm_x86_ops.complete_emulated_msr(vcpu, r);
+       return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
 
@@ -1767,16 +1779,15 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
        else
                trace_kvm_msr_write_ex(ecx, data);
 
-       return kvm_x86_ops.complete_emulated_msr(vcpu, r);
+       return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
 
-bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
+static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
 {
        return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
                xfer_to_guest_mode_work_pending();
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
 
 /*
  * The fast path for frequent and performance sensitive wrmsr emulation,
@@ -1926,15 +1937,14 @@ static s64 get_kvmclock_base_ns(void)
 }
 #endif
 
-static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
+void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
 {
        int version;
        int r;
        struct pvclock_wall_clock wc;
+       u32 wc_sec_hi;
        u64 wall_nsec;
 
-       kvm->arch.wall_clock = wall_clock;
-
        if (!wall_clock)
                return;
 
@@ -1963,6 +1973,12 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 
        kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
 
+       if (sec_hi_ofs) {
+               wc_sec_hi = wall_nsec >> 32;
+               kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
+                               &wc_sec_hi, sizeof(wc_sec_hi));
+       }
+
        version++;
        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 }
@@ -2199,7 +2215,7 @@ EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
        vcpu->arch.l1_tsc_offset = offset;
-       vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
+       vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
 }
 
 static inline bool kvm_check_tsc_unstable(void)
@@ -2582,13 +2598,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
        return ret;
 }
 
-static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
+                                  struct gfn_to_hva_cache *cache,
+                                  unsigned int offset)
 {
        struct kvm_vcpu_arch *vcpu = &v->arch;
        struct pvclock_vcpu_time_info guest_hv_clock;
 
-       if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
-               &guest_hv_clock, sizeof(guest_hv_clock))))
+       if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache,
+               &guest_hv_clock, offset, sizeof(guest_hv_clock))))
                return;
 
        /* This VCPU is paused, but it's legal for a guest to read another
@@ -2611,9 +2629,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
                ++guest_hv_clock.version;  /* first time write, random junk */
 
        vcpu->hv_clock.version = guest_hv_clock.version + 1;
-       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-                               &vcpu->hv_clock,
-                               sizeof(vcpu->hv_clock.version));
+       kvm_write_guest_offset_cached(v->kvm, cache,
+                                     &vcpu->hv_clock, offset,
+                                     sizeof(vcpu->hv_clock.version));
 
        smp_wmb();
 
@@ -2627,16 +2645,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
 
        trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
 
-       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-                               &vcpu->hv_clock,
-                               sizeof(vcpu->hv_clock));
+       kvm_write_guest_offset_cached(v->kvm, cache,
+                                     &vcpu->hv_clock, offset,
+                                     sizeof(vcpu->hv_clock));
 
        smp_wmb();
 
        vcpu->hv_clock.version++;
-       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-                               &vcpu->hv_clock,
-                               sizeof(vcpu->hv_clock.version));
+       kvm_write_guest_offset_cached(v->kvm, cache,
+                                    &vcpu->hv_clock, offset,
+                                    sizeof(vcpu->hv_clock.version));
 }
 
 static int kvm_guest_time_update(struct kvm_vcpu *v)
@@ -2723,7 +2741,12 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        vcpu->hv_clock.flags = pvclock_flags;
 
        if (vcpu->pv_time_enabled)
-               kvm_setup_pvclock_page(v);
+               kvm_setup_pvclock_page(v, &vcpu->pv_time, 0);
+       if (vcpu->xen.vcpu_info_set)
+               kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache,
+                                      offsetof(struct compat_vcpu_info, time));
+       if (vcpu->xen.vcpu_time_info_set)
+               kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0);
        if (v == kvm_get_vcpu(v->kvm, 0))
                kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
        return 0;
@@ -2848,32 +2871,6 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        return 0;
 }
 
-static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
-{
-       struct kvm *kvm = vcpu->kvm;
-       int lm = is_long_mode(vcpu);
-       u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
-               : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
-       u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
-               : kvm->arch.xen_hvm_config.blob_size_32;
-       u32 page_num = data & ~PAGE_MASK;
-       u64 page_addr = data & PAGE_MASK;
-       u8 *page;
-
-       if (page_num >= blob_size)
-               return 1;
-
-       page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
-       if (IS_ERR(page))
-               return PTR_ERR(page);
-
-       if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
-               kfree(page);
-               return 1;
-       }
-       return 0;
-}
-
 static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
 {
        u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
@@ -2945,13 +2942,13 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
 static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
 {
        ++vcpu->stat.tlb_flush;
-       kvm_x86_ops.tlb_flush_all(vcpu);
+       static_call(kvm_x86_tlb_flush_all)(vcpu);
 }
 
 static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
 {
        ++vcpu->stat.tlb_flush;
-       kvm_x86_ops.tlb_flush_guest(vcpu);
+       static_call(kvm_x86_tlb_flush_guest)(vcpu);
 }
 
 static void record_steal_time(struct kvm_vcpu *vcpu)
@@ -3007,6 +3004,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        u32 msr = msr_info->index;
        u64 data = msr_info->data;
 
+       if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr)
+               return kvm_xen_write_hypercall_page(vcpu, data);
+
        switch (msr) {
        case MSR_AMD64_NB_CFG:
        case MSR_IA32_UCODE_WRITE:
@@ -3063,18 +3063,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                }
                break;
-       case MSR_IA32_DEBUGCTLMSR:
-               if (!data) {
-                       /* We support the non-activated case already */
-                       break;
-               } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
-                       /* Values other than LBR and BTF are vendor-specific,
-                          thus reserved and should throw a #GP */
-                       return 1;
-               } else if (report_ignored_msrs)
-                       vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
-                                   __func__, data);
-               break;
        case 0x200 ... 0x2ff:
                return kvm_mtrr_set_msr(vcpu, msr, data);
        case MSR_IA32_APICBASE:
@@ -3143,13 +3131,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
                        return 1;
 
-               kvm_write_wall_clock(vcpu->kvm, data);
+               vcpu->kvm->arch.wall_clock = data;
+               kvm_write_wall_clock(vcpu->kvm, data, 0);
                break;
        case MSR_KVM_WALL_CLOCK:
                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
                        return 1;
 
-               kvm_write_wall_clock(vcpu->kvm, data);
+               vcpu->kvm->arch.wall_clock = data;
+               kvm_write_wall_clock(vcpu->kvm, data, 0);
                break;
        case MSR_KVM_SYSTEM_TIME_NEW:
                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
@@ -3294,8 +3284,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vcpu->arch.msr_misc_features_enables = data;
                break;
        default:
-               if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
-                       return xen_hvm_config(vcpu, data);
                if (kvm_pmu_is_valid_msr(vcpu, msr))
                        return kvm_pmu_set_msr(vcpu, msr_info);
                return KVM_MSR_RET_INVALID;
@@ -3347,7 +3335,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        switch (msr_info->index) {
        case MSR_IA32_PLATFORM_ID:
        case MSR_IA32_EBL_CR_POWERON:
-       case MSR_IA32_DEBUGCTLMSR:
        case MSR_IA32_LASTBRANCHFROMIP:
        case MSR_IA32_LASTBRANCHTOIP:
        case MSR_IA32_LASTINTFROMIP:
@@ -3729,7 +3716,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_PIT2:
        case KVM_CAP_PIT_STATE2:
        case KVM_CAP_SET_IDENTITY_MAP_ADDR:
-       case KVM_CAP_XEN_HVM:
        case KVM_CAP_VCPU_EVENTS:
        case KVM_CAP_HYPERV:
        case KVM_CAP_HYPERV_VAPIC:
@@ -3769,6 +3755,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
                r = 1;
                break;
+       case KVM_CAP_XEN_HVM:
+               r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
+                   KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
+                   KVM_XEN_HVM_CONFIG_SHARED_INFO;
+               break;
        case KVM_CAP_SYNC_REGS:
                r = KVM_SYNC_X86_VALID_FIELDS;
                break;
@@ -3790,10 +3781,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                 * fringe case that is not enabled except via specific settings
                 * of the module parameters.
                 */
-               r = kvm_x86_ops.has_emulated_msr(kvm, MSR_IA32_SMBASE);
+               r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
                break;
        case KVM_CAP_VAPIC:
-               r = !kvm_x86_ops.cpu_has_accelerated_tpr();
+               r = !static_call(kvm_x86_cpu_has_accelerated_tpr)();
                break;
        case KVM_CAP_NR_VCPUS:
                r = KVM_SOFT_MAX_VCPUS;
@@ -3835,6 +3826,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_STEAL_TIME:
                r = sched_info_on();
                break;
+       case KVM_CAP_X86_BUS_LOCK_EXIT:
+               if (kvm_has_bus_lock_exit)
+                       r = KVM_BUS_LOCK_DETECTION_OFF |
+                           KVM_BUS_LOCK_DETECTION_EXIT;
+               else
+                       r = 0;
+               break;
        default:
                break;
        }
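
A minimal sketch of the userspace side (probe_xen_hvm() and kvm_fd are hypothetical; kvm_fd is an open /dev/kvm or VM descriptor) of querying the flags now advertised for KVM_CAP_XEN_HVM:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Hypothetical probe of the Xen-HVM feature flags. */
static void probe_xen_hvm(int kvm_fd)
{
	int xen = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XEN_HVM);

	if (xen & KVM_XEN_HVM_CONFIG_HYPERCALL_MSR)
		printf("hypercall page can be set up via the config MSR\n");
	if (xen & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
		printf("Xen hypercalls can be intercepted to userspace\n");
	if (xen & KVM_XEN_HVM_CONFIG_SHARED_INFO)
		printf("kernel can maintain the Xen shared_info page\n");
}
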
@@ -3952,14 +3950,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
        /* Address WBINVD may be executed by guest */
        if (need_emulate_wbinvd(vcpu)) {
-               if (kvm_x86_ops.has_wbinvd_exit())
+               if (static_call(kvm_x86_has_wbinvd_exit)())
                        cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
                else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
                        smp_call_function_single(vcpu->cpu,
                                        wbinvd_ipi, NULL, 1);
        }
 
-       kvm_x86_ops.vcpu_load(vcpu, cpu);
+       static_call(kvm_x86_vcpu_load)(vcpu, cpu);
 
        /* Save host pkru register if supported */
        vcpu->arch.host_pkru = read_pkru();
@@ -4005,6 +4003,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 {
        struct kvm_host_map map;
        struct kvm_steal_time *st;
+       int idx;
 
        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
                return;
@@ -4012,9 +4011,15 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
        if (vcpu->arch.st.preempted)
                return;
 
+       /*
+        * Take the srcu lock as memslots will be accessed to check the gfn
+        * cache generation against the memslots generation.
+        */
+       idx = srcu_read_lock(&vcpu->kvm->srcu);
+
        if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
                        &vcpu->arch.st.cache, true))
-               return;
+               goto out;
 
        st = map.hva +
                offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
@@ -4022,33 +4027,18 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
        st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
 
        kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+
+out:
+       srcu_read_unlock(&vcpu->kvm->srcu, idx);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
-       int idx;
-
        if (vcpu->preempted && !vcpu->arch.guest_state_protected)
-               vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
+               vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
 
-       /*
-        * Disable page faults because we're in atomic context here.
-        * kvm_write_guest_offset_cached() would call might_fault()
-        * that relies on pagefault_disable() to tell if there's a
-        * bug. NOTE: the write to guest memory may not go through if
-        * during postcopy live migration or if there's heavy guest
-        * paging.
-        */
-       pagefault_disable();
-       /*
-        * kvm_memslots() will be called by
-        * kvm_write_guest_offset_cached() so take the srcu lock.
-        */
-       idx = srcu_read_lock(&vcpu->kvm->srcu);
        kvm_steal_time_set_preempted(vcpu);
-       srcu_read_unlock(&vcpu->kvm->srcu, idx);
-       pagefault_enable();
-       kvm_x86_ops.vcpu_put(vcpu);
+       static_call(kvm_x86_vcpu_put)(vcpu);
        vcpu->arch.last_host_tsc = rdtsc();
        /*
         * If userspace has set any breakpoints or watchpoints, dr6 is restored
@@ -4062,7 +4052,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
                                    struct kvm_lapic_state *s)
 {
        if (vcpu->arch.apicv_active)
-               kvm_x86_ops.sync_pir_to_irr(vcpu);
+               static_call(kvm_x86_sync_pir_to_irr)(vcpu);
 
        return kvm_apic_get_state(vcpu, s);
 }
@@ -4172,7 +4162,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
        for (bank = 0; bank < bank_num; bank++)
                vcpu->arch.mce_banks[bank*4] = ~(u64)0;
 
-       kvm_x86_ops.setup_mce(vcpu);
+       static_call(kvm_x86_setup_mce)(vcpu);
 out:
        return r;
 }
@@ -4230,6 +4220,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 {
        process_nmi(vcpu);
 
+       if (kvm_check_request(KVM_REQ_SMI, vcpu))
+               process_smi(vcpu);
+
        /*
         * In guest mode, payload delivery should be deferred,
         * so that the L1 hypervisor can intercept #PF before
@@ -4276,11 +4269,11 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
                vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
        events->interrupt.nr = vcpu->arch.interrupt.nr;
        events->interrupt.soft = 0;
-       events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
+       events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
 
        events->nmi.injected = vcpu->arch.nmi_injected;
        events->nmi.pending = vcpu->arch.nmi_pending != 0;
-       events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu);
+       events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
        events->nmi.pad = 0;
 
        events->sipi_vector = 0; /* never valid when reporting to user space */
@@ -4347,13 +4340,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
        vcpu->arch.interrupt.nr = events->interrupt.nr;
        vcpu->arch.interrupt.soft = events->interrupt.soft;
        if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
-               kvm_x86_ops.set_interrupt_shadow(vcpu,
-                                                 events->interrupt.shadow);
+               static_call(kvm_x86_set_interrupt_shadow)(vcpu,
+                                               events->interrupt.shadow);
 
        vcpu->arch.nmi_injected = events->nmi.injected;
        if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
                vcpu->arch.nmi_pending = events->nmi.pending;
-       kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked);
+       static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
 
        if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
            lapic_in_kernel(vcpu))
@@ -4409,9 +4402,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
        if (dbgregs->flags)
                return -EINVAL;
 
-       if (dbgregs->dr6 & ~0xffffffffull)
+       if (!kvm_dr6_valid(dbgregs->dr6))
                return -EINVAL;
-       if (dbgregs->dr7 & ~0xffffffffull)
+       if (!kvm_dr7_valid(dbgregs->dr7))
                return -EINVAL;
 
        memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
@@ -4648,7 +4641,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
                if (!kvm_x86_ops.enable_direct_tlbflush)
                        return -ENOTTY;
 
-               return kvm_x86_ops.enable_direct_tlbflush(vcpu);
+               return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
 
        case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
                vcpu->arch.pv_cpuid.enforce = cap->args[0];
@@ -5019,6 +5012,26 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        case KVM_GET_SUPPORTED_HV_CPUID:
                r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
                break;
+       case KVM_XEN_VCPU_GET_ATTR: {
+               struct kvm_xen_vcpu_attr xva;
+
+               r = -EFAULT;
+               if (copy_from_user(&xva, argp, sizeof(xva)))
+                       goto out;
+               r = kvm_xen_vcpu_get_attr(vcpu, &xva);
+               if (!r && copy_to_user(argp, &xva, sizeof(xva)))
+                       r = -EFAULT;
+               break;
+       }
+       case KVM_XEN_VCPU_SET_ATTR: {
+               struct kvm_xen_vcpu_attr xva;
+
+               r = -EFAULT;
+               if (copy_from_user(&xva, argp, sizeof(xva)))
+                       goto out;
+               r = kvm_xen_vcpu_set_attr(vcpu, &xva);
+               break;
+       }
        default:
                r = -EINVAL;
        }
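
A minimal userspace sketch of the matching KVM_XEN_VCPU_SET_ATTR call (set_xen_vcpu_info() and gpa are hypothetical; the attribute type and the u.gpa layout follow this series' uapi additions):

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Hypothetical: tell KVM where this vcpu's Xen vcpu_info lives so that
 * kvm_guest_time_update() (above) can keep its pvclock area current.
 * 'gpa' is a guest-physical address chosen by the VMM. */
static int set_xen_vcpu_info(int vcpu_fd, unsigned long long gpa)
{
	struct kvm_xen_vcpu_attr xva = {
		.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
		.u.gpa = gpa,
	};

	if (ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &xva)) {
		perror("KVM_XEN_VCPU_SET_ATTR");
		return -1;
	}
	return 0;
}
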
@@ -5040,14 +5053,14 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
 
        if (addr > (unsigned int)(-3 * PAGE_SIZE))
                return -EINVAL;
-       ret = kvm_x86_ops.set_tss_addr(kvm, addr);
+       ret = static_call(kvm_x86_set_tss_addr)(kvm, addr);
        return ret;
 }
 
 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
                                              u64 ident_addr)
 {
-       return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr);
+       return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr);
 }
 
 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
@@ -5204,8 +5217,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
        /*
         * Flush potentially hardware-cached dirty pages to dirty_bitmap.
         */
-       if (kvm_x86_ops.flush_log_dirty)
-               kvm_x86_ops.flush_log_dirty(kvm);
+       static_call_cond(kvm_x86_flush_log_dirty)(kvm);
 }
 
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
@@ -5295,6 +5307,20 @@ split_irqchip_unlock:
                kvm->arch.user_space_msr_mask = cap->args[0];
                r = 0;
                break;
+       case KVM_CAP_X86_BUS_LOCK_EXIT:
+               r = -EINVAL;
+               if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE)
+                       break;
+
+               if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) &&
+                   (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
+                       break;
+
+               if (kvm_has_bus_lock_exit &&
+                   cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
+                       kvm->arch.bus_lock_detection_enabled = true;
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
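
A minimal userspace sketch of opting in to the behaviour handled above (enable_bus_lock_exit() and vm_fd are hypothetical):

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Hypothetical: request an exit to userspace whenever the guest takes
 * a bus lock, matching the capability check handled above. */
static int enable_bus_lock_exit(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_X86_BUS_LOCK_EXIT,
		.args = { KVM_BUS_LOCK_DETECTION_EXIT },
	};

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap)) {
		perror("KVM_ENABLE_CAP(KVM_CAP_X86_BUS_LOCK_EXIT)");
		return -1;
	}
	return 0;
}
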
@@ -5624,11 +5650,27 @@ set_pit2_out:
                r = -EFAULT;
                if (copy_from_user(&xhc, argp, sizeof(xhc)))
                        goto out;
-               r = -EINVAL;
-               if (xhc.flags)
+               r = kvm_xen_hvm_config(kvm, &xhc);
+               break;
+       }
+       case KVM_XEN_HVM_GET_ATTR: {
+               struct kvm_xen_hvm_attr xha;
+
+               r = -EFAULT;
+               if (copy_from_user(&xha, argp, sizeof(xha)))
                        goto out;
-               memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
-               r = 0;
+               r = kvm_xen_hvm_get_attr(kvm, &xha);
+               if (!r && copy_to_user(argp, &xha, sizeof(xha)))
+                       r = -EFAULT;
+               break;
+       }
+       case KVM_XEN_HVM_SET_ATTR: {
+               struct kvm_xen_hvm_attr xha;
+
+               r = -EFAULT;
+               if (copy_from_user(&xha, argp, sizeof(xha)))
+                       goto out;
+               r = kvm_xen_hvm_set_attr(kvm, &xha);
                break;
        }
        case KVM_SET_CLOCK: {
@@ -5673,7 +5715,7 @@ set_pit2_out:
        case KVM_MEMORY_ENCRYPT_OP: {
                r = -ENOTTY;
                if (kvm_x86_ops.mem_enc_op)
-                       r = kvm_x86_ops.mem_enc_op(kvm, argp);
+                       r = static_call(kvm_x86_mem_enc_op)(kvm, argp);
                break;
        }
        case KVM_MEMORY_ENCRYPT_REG_REGION: {
@@ -5685,7 +5727,7 @@ set_pit2_out:
 
                r = -ENOTTY;
                if (kvm_x86_ops.mem_enc_reg_region)
-                       r = kvm_x86_ops.mem_enc_reg_region(kvm, &region);
+                       r = static_call(kvm_x86_mem_enc_reg_region)(kvm, &region);
                break;
        }
        case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
@@ -5697,7 +5739,7 @@ set_pit2_out:
 
                r = -ENOTTY;
                if (kvm_x86_ops.mem_enc_unreg_region)
-                       r = kvm_x86_ops.mem_enc_unreg_region(kvm, &region);
+                       r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, &region);
                break;
        }
        case KVM_HYPERV_EVENTFD: {
@@ -5799,7 +5841,7 @@ static void kvm_init_msr_list(void)
        }
 
        for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
-               if (!kvm_x86_ops.has_emulated_msr(NULL, emulated_msrs_all[i]))
+               if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i]))
                        continue;
 
                emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
@@ -5862,13 +5904,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 static void kvm_set_segment(struct kvm_vcpu *vcpu,
                        struct kvm_segment *var, int seg)
 {
-       kvm_x86_ops.set_segment(vcpu, var, seg);
+       static_call(kvm_x86_set_segment)(vcpu, var, seg);
 }
 
 void kvm_get_segment(struct kvm_vcpu *vcpu,
                     struct kvm_segment *var, int seg)
 {
-       kvm_x86_ops.get_segment(vcpu, var, seg);
+       static_call(kvm_x86_get_segment)(vcpu, var, seg);
 }
 
 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
@@ -5888,14 +5930,14 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
                              struct x86_exception *exception)
 {
-       u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
 
  gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
                                struct x86_exception *exception)
 {
-       u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
        access |= PFERR_FETCH_MASK;
        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
@@ -5903,7 +5945,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
                               struct x86_exception *exception)
 {
-       u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
        access |= PFERR_WRITE_MASK;
        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
@@ -5952,7 +5994,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
                                struct x86_exception *exception)
 {
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-       u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
        unsigned offset;
        int ret;
 
@@ -5977,7 +6019,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
                               gva_t addr, void *val, unsigned int bytes,
                               struct x86_exception *exception)
 {
-       u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
 
        /*
         * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
@@ -5998,7 +6040,7 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        u32 access = 0;
 
-       if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
+       if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3)
                access |= PFERR_USER_MASK;
 
        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
@@ -6051,7 +6093,7 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        u32 access = PFERR_WRITE_MASK;
 
-       if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
+       if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3)
                access |= PFERR_USER_MASK;
 
        return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
@@ -6076,7 +6118,7 @@ int handle_ud(struct kvm_vcpu *vcpu)
        char sig[5]; /* ud2; .ascii "kvm" */
        struct x86_exception e;
 
-       if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0)))
+       if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, NULL, 0)))
                return 1;
 
        if (force_emulation_prefix &&
@@ -6110,7 +6152,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
                                gpa_t *gpa, struct x86_exception *exception,
                                bool write)
 {
-       u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
+       u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
                | (write ? PFERR_WRITE_MASK : 0);
 
        /*
@@ -6518,7 +6560,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
 
 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
 {
-       return kvm_x86_ops.get_segment_base(vcpu, seg);
+       return static_call(kvm_x86_get_segment_base)(vcpu, seg);
 }
 
 static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
@@ -6531,7 +6573,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
        if (!need_emulate_wbinvd(vcpu))
                return X86EMUL_CONTINUE;
 
-       if (kvm_x86_ops.has_wbinvd_exit()) {
+       if (static_call(kvm_x86_has_wbinvd_exit)()) {
                int cpu = get_cpu();
 
                cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
@@ -6558,17 +6600,17 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
        kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
 }
 
-static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
-                          unsigned long *dest)
+static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+                           unsigned long *dest)
 {
-       return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
+       kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
 }
 
 static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
                           unsigned long value)
 {
 
-       return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
+       return kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
 }
 
 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -6636,27 +6678,27 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
 
 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
 {
-       return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt));
+       return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt));
 }
 
 static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
 {
-       kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt);
+       static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt);
 }
 
 static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
 {
-       kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt);
+       static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt);
 }
 
 static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
 {
-       kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt);
+       static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt);
 }
 
 static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
 {
-       kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt);
+       static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt);
 }
 
 static unsigned long emulator_get_cached_segment_base(
@@ -6798,7 +6840,7 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
                              struct x86_instruction_info *info,
                              enum x86_intercept_stage stage)
 {
-       return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage,
+       return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage,
                                            &ctxt->exception);
 }
 
@@ -6836,7 +6878,7 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon
 
 static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
 {
-       kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked);
+       static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked);
 }
 
 static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
@@ -6852,7 +6894,7 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla
 static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
                                  const char *smstate)
 {
-       return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate);
+       return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate);
 }
 
 static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
@@ -6914,7 +6956,7 @@ static const struct x86_emulate_ops emulate_ops = {
 
 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
 {
-       u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
+       u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
        /*
         * an sti; sti; sequence only disable interrupts for the first
         * instruction. So, if the last instruction, be it emulated or
@@ -6925,7 +6967,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
        if (int_shadow & mask)
                mask = 0;
        if (unlikely(int_shadow || mask)) {
-               kvm_x86_ops.set_interrupt_shadow(vcpu, mask);
+               static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask);
                if (!mask)
                        kvm_make_request(KVM_REQ_EVENT, vcpu);
        }
@@ -6967,7 +7009,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        int cs_db, cs_l;
 
-       kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+       static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
 
        ctxt->gpa_available = false;
        ctxt->eflags = kvm_get_rflags(vcpu);
@@ -7028,7 +7070,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
 
        kvm_queue_exception(vcpu, UD_VECTOR);
 
-       if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) {
+       if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
                vcpu->run->internal.ndata = 0;
@@ -7088,9 +7130,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        if (vcpu->arch.mmu->direct_map) {
                unsigned int indirect_shadow_pages;
 
-               spin_lock(&vcpu->kvm->mmu_lock);
+               write_lock(&vcpu->kvm->mmu_lock);
                indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
-               spin_unlock(&vcpu->kvm->mmu_lock);
+               write_unlock(&vcpu->kvm->mmu_lock);
 
                if (indirect_shadow_pages)
                        kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
@@ -7197,7 +7239,7 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
        struct kvm_run *kvm_run = vcpu->run;
 
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
-               kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
+               kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW;
                kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
                kvm_run->debug.arch.exception = DB_VECTOR;
                kvm_run->exit_reason = KVM_EXIT_DEBUG;
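
The DR6_FIXED_1 | DR6_RTM pairs here and below are folded into the new DR6_ACTIVE_LOW constant. A small sketch of the assumed relationship (values as recalled for this series; the authoritative definitions live in arch/x86/include/asm/kvm_host.h):

/* DR6 "active low" bits read as 1 when the condition they report is not
 * asserted; RTM is such a bit, so it belongs with the fixed-to-1 set. */
#define DR6_FIXED_1     0xfffe0ff0
#define DR6_RTM         (1 << 16)
#define DR6_ACTIVE_LOW  (DR6_FIXED_1 | DR6_RTM)  /* == 0xffff0ff0, the old DR6_INIT */
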
@@ -7209,10 +7251,10 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
 
 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
-       unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
+       unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
        int r;
 
-       r = kvm_x86_ops.skip_emulated_instruction(vcpu);
+       r = static_call(kvm_x86_skip_emulated_instruction)(vcpu);
        if (unlikely(!r))
                return 0;
 
@@ -7241,7 +7283,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
                                           vcpu->arch.eff_db);
 
                if (dr6 != 0) {
-                       kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
+                       kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
                        kvm_run->debug.arch.pc = eip;
                        kvm_run->debug.arch.exception = DB_VECTOR;
                        kvm_run->exit_reason = KVM_EXIT_DEBUG;
@@ -7298,6 +7340,42 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
        return false;
 }
 
+/*
+ * Decode to be emulated instruction. Return EMULATION_OK if success.
+ */
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
+                                   void *insn, int insn_len)
+{
+       int r = EMULATION_OK;
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+       init_emulate_ctxt(vcpu);
+
+       /*
+        * We will reenter on the same instruction since we do not set
+        * complete_userspace_io. This does not handle watchpoints yet,
+        * those would be handled in the emulate_ops.
+        */
+       if (!(emulation_type & EMULTYPE_SKIP) &&
+           kvm_vcpu_check_breakpoint(vcpu, &r))
+               return r;
+
+       ctxt->interruptibility = 0;
+       ctxt->have_exception = false;
+       ctxt->exception.vector = -1;
+       ctxt->perm_ok = false;
+
+       ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
+
+       r = x86_decode_insn(ctxt, insn, insn_len);
+
+       trace_kvm_emulate_insn_start(vcpu);
+       ++vcpu->stat.insn_emulation;
+
+       return r;
+}
+EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
+
 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                            int emulation_type, void *insn, int insn_len)
 {
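
The decode step is now exported on its own, so a caller can look at the decoded instruction before (or instead of) emulating it and then pass EMULTYPE_NO_DECODE to skip a second decode. A hypothetical caller for illustration only; the helper name and the filtering step are made up, and only the two exported functions and the EMULTYPE_NO_DECODE flag are taken from the surrounding code:

static int emulate_after_inspection(struct kvm_vcpu *vcpu, void *insn, int insn_len)
{
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;

        if (x86_decode_emulated_instruction(vcpu, 0, insn, insn_len) != EMULATION_OK)
                return 0;       /* caller-specific failure handling would go here */

        /* The caller can now filter on the decoded state, e.g. ctxt->b
         * (primary opcode byte) or ctxt->modrm, before emulating. */

        return x86_emulate_instruction(vcpu, 0, EMULTYPE_NO_DECODE, NULL, 0);
}
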
@@ -7306,7 +7384,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        bool writeback = true;
        bool write_fault_to_spt;
 
-       if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
+       if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, insn, insn_len)))
                return 1;
 
        vcpu->arch.l1tf_flush_l1d = true;
@@ -7317,32 +7395,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
         */
        write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
        vcpu->arch.write_fault_to_shadow_pgtable = false;
-       kvm_clear_exception_queue(vcpu);
 
        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
-               init_emulate_ctxt(vcpu);
-
-               /*
-                * We will reenter on the same instruction since
-                * we do not set complete_userspace_io.  This does not
-                * handle watchpoints yet, those would be handled in
-                * the emulate_ops.
-                */
-               if (!(emulation_type & EMULTYPE_SKIP) &&
-                   kvm_vcpu_check_breakpoint(vcpu, &r))
-                       return r;
-
-               ctxt->interruptibility = 0;
-               ctxt->have_exception = false;
-               ctxt->exception.vector = -1;
-               ctxt->perm_ok = false;
+               kvm_clear_exception_queue(vcpu);
 
-               ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
-
-               r = x86_decode_insn(ctxt, insn, insn_len);
-
-               trace_kvm_emulate_insn_start(vcpu);
-               ++vcpu->stat.insn_emulation;
+               r = x86_decode_emulated_instruction(vcpu, emulation_type,
+                                                   insn, insn_len);
                if (r != EMULATION_OK)  {
                        if ((emulation_type & EMULTYPE_TRAP_UD) ||
                            (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
@@ -7449,7 +7507,7 @@ restart:
                r = 1;
 
        if (writeback) {
-               unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
+               unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
                toggle_interruptibility(vcpu, ctxt->interruptibility);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
                if (!ctxt->have_exception ||
@@ -7458,7 +7516,7 @@ restart:
                        if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
                                r = kvm_vcpu_do_singlestep(vcpu);
                        if (kvm_x86_ops.update_emulated_instruction)
-                               kvm_x86_ops.update_emulated_instruction(vcpu);
+                               static_call(kvm_x86_update_emulated_instruction)(vcpu);
                        __kvm_set_rflags(vcpu, ctxt->eflags);
                }
 
@@ -7787,7 +7845,7 @@ static int kvm_is_user_mode(void)
        int user_mode = 3;
 
        if (__this_cpu_read(current_vcpu))
-               user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu));
+               user_mode = static_call(kvm_x86_get_cpl)(__this_cpu_read(current_vcpu));
 
        return user_mode != 0;
 }
@@ -7932,7 +7990,6 @@ int kvm_arch_init(void *opaque)
                supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
        }
 
-       kvm_lapic_init();
        if (pi_inject_timer == -1)
                pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
 #ifdef CONFIG_X86_64
@@ -7974,6 +8031,7 @@ void kvm_arch_exit(void)
        kvm_mmu_module_exit();
        free_percpu(user_return_msrs);
        kmem_cache_destroy(x86_fpu_cache);
+       WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
 }
 
 static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason)
@@ -8025,7 +8083,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
        if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
                return -KVM_EOPNOTSUPP;
 
-       if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
+       if (!kvm_get_walltime_and_clockread(&ts, &cycle))
                return -KVM_EOPNOTSUPP;
 
        clock_pairing.sec = ts.tv_sec;
@@ -8101,7 +8159,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
        unsigned long nr, a0, a1, a2, a3, ret;
        int op_64_bit;
 
-       if (kvm_hv_hypercall_enabled(vcpu->kvm))
+       if (kvm_xen_hypercall_enabled(vcpu->kvm))
+               return kvm_xen_hypercall(vcpu);
+
+       if (kvm_hv_hypercall_enabled(vcpu))
                return kvm_hv_hypercall(vcpu);
 
        nr = kvm_rax_read(vcpu);
@@ -8121,7 +8182,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                a3 &= 0xFFFFFFFF;
        }
 
-       if (kvm_x86_ops.get_cpl(vcpu) != 0) {
+       if (static_call(kvm_x86_get_cpl)(vcpu) != 0) {
                ret = -KVM_EPERM;
                goto out;
        }
@@ -8178,7 +8239,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
        char instruction[3];
        unsigned long rip = kvm_rip_read(vcpu);
 
-       kvm_x86_ops.patch_hypercall(vcpu, instruction);
+       static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
 
        return emulator_write_emulated(ctxt, rip, instruction, 3,
                &ctxt->exception);
@@ -8202,12 +8263,14 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
        kvm_run->if_flag = !vcpu->arch.guest_state_protected
                && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
 
-       kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
        kvm_run->cr8 = kvm_get_cr8(vcpu);
        kvm_run->apic_base = kvm_get_apic_base(vcpu);
        kvm_run->ready_for_interrupt_injection =
                pic_in_kernel(vcpu->kvm) ||
                kvm_vcpu_ready_for_interrupt_injection(vcpu);
+
+       if (is_smm(vcpu))
+               kvm_run->flags |= KVM_RUN_X86_SMM;
 }
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -8233,7 +8296,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 
        tpr = kvm_lapic_get_cr8(vcpu);
 
-       kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr);
+       static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
 }
 
 static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
@@ -8244,7 +8307,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
        /* try to reinject previous events if any */
 
        if (vcpu->arch.exception.injected) {
-               kvm_x86_ops.queue_exception(vcpu);
+               static_call(kvm_x86_queue_exception)(vcpu);
                can_inject = false;
        }
        /*
@@ -8263,10 +8326,10 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
         */
        else if (!vcpu->arch.exception.pending) {
                if (vcpu->arch.nmi_injected) {
-                       kvm_x86_ops.set_nmi(vcpu);
+                       static_call(kvm_x86_set_nmi)(vcpu);
                        can_inject = false;
                } else if (vcpu->arch.interrupt.injected) {
-                       kvm_x86_ops.set_irq(vcpu);
+                       static_call(kvm_x86_set_irq)(vcpu);
                        can_inject = false;
                }
        }
@@ -8307,7 +8370,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
                        }
                }
 
-               kvm_x86_ops.queue_exception(vcpu);
+               static_call(kvm_x86_queue_exception)(vcpu);
                can_inject = false;
        }
 
@@ -8323,7 +8386,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
         * The kvm_x86_ops hooks communicate this by returning -EBUSY.
         */
        if (vcpu->arch.smi_pending) {
-               r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY;
+               r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
                if (r < 0)
                        goto busy;
                if (r) {
@@ -8332,35 +8395,35 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
                        enter_smm(vcpu);
                        can_inject = false;
                } else
-                       kvm_x86_ops.enable_smi_window(vcpu);
+                       static_call(kvm_x86_enable_smi_window)(vcpu);
        }
 
        if (vcpu->arch.nmi_pending) {
-               r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY;
+               r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
                if (r < 0)
                        goto busy;
                if (r) {
                        --vcpu->arch.nmi_pending;
                        vcpu->arch.nmi_injected = true;
-                       kvm_x86_ops.set_nmi(vcpu);
+                       static_call(kvm_x86_set_nmi)(vcpu);
                        can_inject = false;
-                       WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0);
+                       WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
                }
                if (vcpu->arch.nmi_pending)
-                       kvm_x86_ops.enable_nmi_window(vcpu);
+                       static_call(kvm_x86_enable_nmi_window)(vcpu);
        }
 
        if (kvm_cpu_has_injectable_intr(vcpu)) {
-               r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY;
+               r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
                if (r < 0)
                        goto busy;
                if (r) {
                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
-                       kvm_x86_ops.set_irq(vcpu);
-                       WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0);
+                       static_call(kvm_x86_set_irq)(vcpu);
+                       WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
                }
                if (kvm_cpu_has_injectable_intr(vcpu))
-                       kvm_x86_ops.enable_irq_window(vcpu);
+                       static_call(kvm_x86_enable_irq_window)(vcpu);
        }
 
        if (is_guest_mode(vcpu) &&
@@ -8385,7 +8448,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
         * If an NMI is already in progress, limit further NMIs to just one.
         * Otherwise, allow two (and we'll inject the first one immediately).
         */
-       if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
+       if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
                limit = 1;
 
        vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
@@ -8475,11 +8538,11 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
        put_smstate(u32, buf, 0x7f7c, seg.limit);
        put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
 
-       kvm_x86_ops.get_gdt(vcpu, &dt);
+       static_call(kvm_x86_get_gdt)(vcpu, &dt);
        put_smstate(u32, buf, 0x7f74, dt.address);
        put_smstate(u32, buf, 0x7f70, dt.size);
 
-       kvm_x86_ops.get_idt(vcpu, &dt);
+       static_call(kvm_x86_get_idt)(vcpu, &dt);
        put_smstate(u32, buf, 0x7f58, dt.address);
        put_smstate(u32, buf, 0x7f54, dt.size);
 
@@ -8529,7 +8592,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
        put_smstate(u32, buf, 0x7e94, seg.limit);
        put_smstate(u64, buf, 0x7e98, seg.base);
 
-       kvm_x86_ops.get_idt(vcpu, &dt);
+       static_call(kvm_x86_get_idt)(vcpu, &dt);
        put_smstate(u32, buf, 0x7e84, dt.size);
        put_smstate(u64, buf, 0x7e88, dt.address);
 
@@ -8539,7 +8602,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
        put_smstate(u32, buf, 0x7e74, seg.limit);
        put_smstate(u64, buf, 0x7e78, seg.base);
 
-       kvm_x86_ops.get_gdt(vcpu, &dt);
+       static_call(kvm_x86_get_gdt)(vcpu, &dt);
        put_smstate(u32, buf, 0x7e64, dt.size);
        put_smstate(u64, buf, 0x7e68, dt.address);
 
@@ -8569,30 +8632,30 @@ static void enter_smm(struct kvm_vcpu *vcpu)
         * vCPU state (e.g. leave guest mode) after we've saved the state into
         * the SMM state-save area.
         */
-       kvm_x86_ops.pre_enter_smm(vcpu, buf);
+       static_call(kvm_x86_pre_enter_smm)(vcpu, buf);
 
        vcpu->arch.hflags |= HF_SMM_MASK;
        kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
 
-       if (kvm_x86_ops.get_nmi_mask(vcpu))
+       if (static_call(kvm_x86_get_nmi_mask)(vcpu))
                vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
        else
-               kvm_x86_ops.set_nmi_mask(vcpu, true);
+               static_call(kvm_x86_set_nmi_mask)(vcpu, true);
 
        kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
        kvm_rip_write(vcpu, 0x8000);
 
        cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
-       kvm_x86_ops.set_cr0(vcpu, cr0);
+       static_call(kvm_x86_set_cr0)(vcpu, cr0);
        vcpu->arch.cr0 = cr0;
 
-       kvm_x86_ops.set_cr4(vcpu, 0);
+       static_call(kvm_x86_set_cr4)(vcpu, 0);
 
        /* Undocumented: IDT limit is set to zero on entry to SMM.  */
        dt.address = dt.size = 0;
-       kvm_x86_ops.set_idt(vcpu, &dt);
+       static_call(kvm_x86_set_idt)(vcpu, &dt);
 
-       __kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+       kvm_set_dr(vcpu, 7, DR7_FIXED_1);
 
        cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
        cs.base = vcpu->arch.smbase;
@@ -8621,7 +8684,7 @@ static void enter_smm(struct kvm_vcpu *vcpu)
 
 #ifdef CONFIG_X86_64
        if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
-               kvm_x86_ops.set_efer(vcpu, 0);
+               static_call(kvm_x86_set_efer)(vcpu, 0);
 #endif
 
        kvm_update_cpuid_runtime(vcpu);
@@ -8659,7 +8722,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
 
        vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
        kvm_apic_update_apicv(vcpu);
-       kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu);
+       static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
 
@@ -8676,7 +8739,7 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
        unsigned long old, new, expected;
 
        if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
-           !kvm_x86_ops.check_apicv_inhibit_reasons(bit))
+           !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
                return;
 
        old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
@@ -8696,7 +8759,7 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
 
        trace_kvm_apicv_update_request(activate, bit);
        if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
-               kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate);
+               static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate);
 
        /*
         * Sending request to update APICV for all other vcpus,
@@ -8722,7 +8785,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
                kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
        else {
                if (vcpu->arch.apicv_active)
-                       kvm_x86_ops.sync_pir_to_irr(vcpu);
+                       static_call(kvm_x86_sync_pir_to_irr)(vcpu);
                if (ioapic_in_kernel(vcpu->kvm))
                        kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
        }
@@ -8740,9 +8803,12 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
        if (!kvm_apic_hw_enabled(vcpu->arch.apic))
                return;
 
-       bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
-                 vcpu_to_synic(vcpu)->vec_bitmap, 256);
-       kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+       if (to_hv_vcpu(vcpu))
+               bitmap_or((ulong *)eoi_exit_bitmap,
+                         vcpu->arch.ioapic_handled_vectors,
+                         to_hv_synic(vcpu)->vec_bitmap, 256);
+
+       static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
 }
 
 void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
@@ -8767,7 +8833,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
        if (!kvm_x86_ops.set_apic_access_page_addr)
                return;
 
-       kvm_x86_ops.set_apic_access_page_addr(vcpu);
+       static_call(kvm_x86_set_apic_access_page_addr)(vcpu);
 }
 
 void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
@@ -8802,9 +8868,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
        if (kvm_request_pending(vcpu)) {
                if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
-                       if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
-                               ;
-                       else if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
+                       if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
                                r = 0;
                                goto out;
                        }
@@ -8894,8 +8958,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        goto out;
                }
                if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
+                       struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
                        vcpu->run->exit_reason = KVM_EXIT_HYPERV;
-                       vcpu->run->hyperv = vcpu->arch.hyperv.exit;
+                       vcpu->run->hyperv = hv_vcpu->exit;
                        r = 0;
                        goto out;
                }
@@ -8912,10 +8978,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
                        kvm_check_async_pf_completion(vcpu);
                if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
-                       kvm_x86_ops.msr_filter_changed(vcpu);
+                       static_call(kvm_x86_msr_filter_changed)(vcpu);
        }
 
-       if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+       if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
+           kvm_xen_has_interrupt(vcpu)) {
                ++vcpu->stat.req_event;
                kvm_apic_accept_events(vcpu);
                if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
@@ -8925,7 +8992,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
                inject_pending_event(vcpu, &req_immediate_exit);
                if (req_int_win)
-                       kvm_x86_ops.enable_irq_window(vcpu);
+                       static_call(kvm_x86_enable_irq_window)(vcpu);
 
                if (kvm_lapic_enabled(vcpu)) {
                        update_cr8_intercept(vcpu);
@@ -8940,7 +9007,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
        preempt_disable();
 
-       kvm_x86_ops.prepare_guest_switch(vcpu);
+       static_call(kvm_x86_prepare_guest_switch)(vcpu);
 
        /*
         * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
@@ -8971,7 +9038,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
         * notified with kvm_vcpu_kick.
         */
        if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
-               kvm_x86_ops.sync_pir_to_irr(vcpu);
+               static_call(kvm_x86_sync_pir_to_irr)(vcpu);
 
        if (kvm_vcpu_exit_request(vcpu)) {
                vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -8985,11 +9052,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
        if (req_immediate_exit) {
                kvm_make_request(KVM_REQ_EVENT, vcpu);
-               kvm_x86_ops.request_immediate_exit(vcpu);
+               static_call(kvm_x86_request_immediate_exit)(vcpu);
        }
 
-       trace_kvm_entry(vcpu);
-
        fpregs_assert_state_consistent();
        if (test_thread_flag(TIF_NEED_FPU_LOAD))
                switch_fpu_return();
@@ -9004,7 +9069,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
        }
 
-       exit_fastpath = kvm_x86_ops.run(vcpu);
+       for (;;) {
+               exit_fastpath = static_call(kvm_x86_run)(vcpu);
+               if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
+                       break;
+
+               if (unlikely(kvm_vcpu_exit_request(vcpu))) {
+                       exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
+                       break;
+               }
+
+               if (vcpu->arch.apicv_active)
+                       static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+       }
 
        /*
         * Do this here before restoring debug registers on the host.  And
@@ -9014,7 +9091,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
         */
        if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
                WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
-               kvm_x86_ops.sync_dirty_debug_regs(vcpu);
+               static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
                kvm_update_dr0123(vcpu);
                kvm_update_dr7(vcpu);
                vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
@@ -9036,7 +9113,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        vcpu->mode = OUTSIDE_GUEST_MODE;
        smp_wmb();
 
-       kvm_x86_ops.handle_exit_irqoff(vcpu);
+       static_call(kvm_x86_handle_exit_irqoff)(vcpu);
 
        /*
         * Consume any pending interrupts, including the possible source of
@@ -9078,13 +9155,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        if (vcpu->arch.apic_attention)
                kvm_lapic_sync_from_vapic(vcpu);
 
-       r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath);
+       r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
        return r;
 
 cancel_injection:
        if (req_immediate_exit)
                kvm_make_request(KVM_REQ_EVENT, vcpu);
-       kvm_x86_ops.cancel_injection(vcpu);
+       static_call(kvm_x86_cancel_injection)(vcpu);
        if (unlikely(vcpu->arch.apic_attention))
                kvm_lapic_sync_from_vapic(vcpu);
 out:
@@ -9094,13 +9171,13 @@ out:
 static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 {
        if (!kvm_arch_vcpu_runnable(vcpu) &&
-           (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) {
+           (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) {
                srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
                kvm_vcpu_block(vcpu);
                vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 
                if (kvm_x86_ops.post_block)
-                       kvm_x86_ops.post_block(vcpu);
+                       static_call(kvm_x86_post_block)(vcpu);
 
                if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
                        return 1;
@@ -9321,6 +9398,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 
        vcpu_load(vcpu);
        kvm_sigset_activate(vcpu);
+       kvm_run->flags = 0;
        kvm_load_guest_fpu(vcpu);
 
        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
@@ -9495,10 +9573,10 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
        kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
 
-       kvm_x86_ops.get_idt(vcpu, &dt);
+       static_call(kvm_x86_get_idt)(vcpu, &dt);
        sregs->idt.limit = dt.size;
        sregs->idt.base = dt.address;
-       kvm_x86_ops.get_gdt(vcpu, &dt);
+       static_call(kvm_x86_get_gdt)(vcpu, &dt);
        sregs->gdt.limit = dt.size;
        sregs->gdt.base = dt.address;
 
@@ -9616,6 +9694,8 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
                 */
                if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
                        return false;
+               if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3))
+                       return false;
        } else {
                /*
                 * Not in 64-bit mode: EFER.LMA is clear and the code
@@ -9649,10 +9729,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 
        dt.size = sregs->idt.limit;
        dt.address = sregs->idt.base;
-       kvm_x86_ops.set_idt(vcpu, &dt);
+       static_call(kvm_x86_set_idt)(vcpu, &dt);
        dt.size = sregs->gdt.limit;
        dt.address = sregs->gdt.base;
-       kvm_x86_ops.set_gdt(vcpu, &dt);
+       static_call(kvm_x86_set_gdt)(vcpu, &dt);
 
        vcpu->arch.cr2 = sregs->cr2;
        mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
@@ -9662,14 +9742,14 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        kvm_set_cr8(vcpu, sregs->cr8);
 
        mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
-       kvm_x86_ops.set_efer(vcpu, sregs->efer);
+       static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
 
        mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
-       kvm_x86_ops.set_cr0(vcpu, sregs->cr0);
+       static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
        vcpu->arch.cr0 = sregs->cr0;
 
        mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
-       kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
+       static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
 
        idx = srcu_read_lock(&vcpu->kvm->srcu);
        if (is_pae_paging(vcpu)) {
@@ -9777,7 +9857,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
         */
        kvm_set_rflags(vcpu, rflags);
 
-       kvm_x86_ops.update_exception_bitmap(vcpu);
+       static_call(kvm_x86_update_exception_bitmap)(vcpu);
 
        r = 0;
 
@@ -9955,7 +10035,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
                if (kvm_apicv_activated(vcpu->kvm))
                        vcpu->arch.apicv_active = true;
        } else
-               static_key_slow_inc(&kvm_no_apic_vcpu);
+               static_branch_inc(&kvm_has_noapic_vcpu);
 
        r = -ENOMEM;
 
@@ -9993,6 +10073,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        fx_init(vcpu);
 
        vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+       vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
 
        vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
 
@@ -10002,9 +10083,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        vcpu->arch.pending_external_vector = -1;
        vcpu->arch.preempted_in_kernel = false;
 
-       kvm_hv_vcpu_init(vcpu);
-
-       r = kvm_x86_ops.vcpu_create(vcpu);
+       r = static_call(kvm_x86_vcpu_create)(vcpu);
        if (r)
                goto free_guest_fpu;
 
@@ -10040,8 +10119,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 {
        struct kvm *kvm = vcpu->kvm;
 
-       kvm_hv_vcpu_postcreate(vcpu);
-
        if (mutex_lock_killable(&vcpu->mutex))
                return;
        vcpu_load(vcpu);
@@ -10067,7 +10144,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
        kvmclock_reset(vcpu);
 
-       kvm_x86_ops.vcpu_free(vcpu);
+       static_call(kvm_x86_vcpu_free)(vcpu);
 
        kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
@@ -10084,7 +10161,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        free_page((unsigned long)vcpu->arch.pio_data);
        kvfree(vcpu->arch.cpuid_entries);
        if (!lapic_in_kernel(vcpu))
-               static_key_slow_dec(&kvm_no_apic_vcpu);
+               static_branch_dec(&kvm_has_noapic_vcpu);
 }
 
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -10103,7 +10180,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
        memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
        kvm_update_dr0123(vcpu);
-       vcpu->arch.dr6 = DR6_INIT;
+       vcpu->arch.dr6 = DR6_ACTIVE_LOW;
        vcpu->arch.dr7 = DR7_FIXED_1;
        kvm_update_dr7(vcpu);
 
@@ -10156,7 +10233,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
        vcpu->arch.ia32_xss = 0;
 
-       kvm_x86_ops.vcpu_reset(vcpu, init_event);
+       static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
 }
 
 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -10182,7 +10259,7 @@ int kvm_arch_hardware_enable(void)
        bool stable, backwards_tsc = false;
 
        kvm_user_return_msr_cpu_online();
-       ret = kvm_x86_ops.hardware_enable();
+       ret = static_call(kvm_x86_hardware_enable)();
        if (ret != 0)
                return ret;
 
@@ -10264,7 +10341,7 @@ int kvm_arch_hardware_enable(void)
 
 void kvm_arch_hardware_disable(void)
 {
-       kvm_x86_ops.hardware_disable();
+       static_call(kvm_x86_hardware_disable)();
        drop_user_return_notifiers();
 }
 
@@ -10283,6 +10360,7 @@ int kvm_arch_hardware_setup(void *opaque)
                return r;
 
        memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+       kvm_ops_static_call_update();
 
        if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
                supported_xss = 0;
@@ -10311,7 +10389,7 @@ int kvm_arch_hardware_setup(void *opaque)
 
 void kvm_arch_hardware_unsetup(void)
 {
-       kvm_x86_ops.hardware_unsetup();
+       static_call(kvm_x86_hardware_unsetup)();
 }
 
 int kvm_arch_check_processor_compat(void *opaque)
@@ -10339,8 +10417,8 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
        return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
 }
 
-struct static_key kvm_no_apic_vcpu __read_mostly;
-EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
+__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
+EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
 
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
@@ -10351,12 +10429,12 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
                pmu->need_cleanup = true;
                kvm_make_request(KVM_REQ_PMU, vcpu);
        }
-       kvm_x86_ops.sched_in(vcpu, cpu);
+       static_call(kvm_x86_sched_in)(vcpu, cpu);
 }
 
 void kvm_arch_free_vm(struct kvm *kvm)
 {
-       kfree(kvm->arch.hyperv.hv_pa_pg);
+       kfree(to_kvm_hv(kvm)->hv_pa_pg);
        vfree(kvm);
 }
 
@@ -10395,7 +10473,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        kvm_page_track_init(kvm);
        kvm_mmu_init_vm(kvm);
 
-       return kvm_x86_ops.vm_init(kvm);
+       return static_call(kvm_x86_vm_init)(kvm);
 }
 
 int kvm_arch_post_init_vm(struct kvm *kvm)
@@ -10494,7 +10572,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
                        return 0;
 
                old_npages = slot->npages;
-               hva = 0;
+               hva = slot->userspace_addr;
        }
 
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
@@ -10540,8 +10618,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
                __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
                mutex_unlock(&kvm->slots_lock);
        }
-       if (kvm_x86_ops.vm_destroy)
-               kvm_x86_ops.vm_destroy(kvm);
+       static_call_cond(kvm_x86_vm_destroy)(kvm);
        for (i = 0; i < kvm->arch.msr_filter.count; i++)
                kfree(kvm->arch.msr_filter.ranges[i].bitmap);
        kvm_pic_destroy(kvm);
@@ -10551,6 +10628,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
        kvm_mmu_uninit_vm(kvm);
        kvm_page_track_cleanup(kvm);
+       kvm_xen_destroy_vm(kvm);
        kvm_hv_destroy_vm(kvm);
 }
 
@@ -10732,7 +10810,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
         */
        if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
                if (kvm_x86_ops.slot_enable_log_dirty) {
-                       kvm_x86_ops.slot_enable_log_dirty(kvm, new);
+                       static_call(kvm_x86_slot_enable_log_dirty)(kvm, new);
                } else {
                        int level =
                                kvm_dirty_log_manual_protect_and_init_set(kvm) ?
@@ -10749,8 +10827,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
                        kvm_mmu_slot_remove_write_access(kvm, new, level);
                }
        } else {
-               if (kvm_x86_ops.slot_disable_log_dirty)
-                       kvm_x86_ops.slot_disable_log_dirty(kvm, new);
+               static_call_cond(kvm_x86_slot_disable_log_dirty)(kvm, new);
        }
 }
 
@@ -10789,7 +10866,7 @@ static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
 {
        return (is_guest_mode(vcpu) &&
                        kvm_x86_ops.guest_apic_has_interrupt &&
-                       kvm_x86_ops.guest_apic_has_interrupt(vcpu));
+                       static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
 }
 
 static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
@@ -10808,12 +10885,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 
        if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
            (vcpu->arch.nmi_pending &&
-            kvm_x86_ops.nmi_allowed(vcpu, false)))
+            static_call(kvm_x86_nmi_allowed)(vcpu, false)))
                return true;
 
        if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
            (vcpu->arch.smi_pending &&
-            kvm_x86_ops.smi_allowed(vcpu, false)))
+            static_call(kvm_x86_smi_allowed)(vcpu, false)))
                return true;
 
        if (kvm_arch_interrupt_allowed(vcpu) &&
@@ -10847,7 +10924,7 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
                 kvm_test_request(KVM_REQ_EVENT, vcpu))
                return true;
 
-       if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu))
+       if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
                return true;
 
        return false;
@@ -10865,7 +10942,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-       return kvm_x86_ops.interrupt_allowed(vcpu, false);
+       return static_call(kvm_x86_interrupt_allowed)(vcpu, false);
 }
 
 unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
@@ -10891,7 +10968,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
 {
        unsigned long rflags;
 
-       rflags = kvm_x86_ops.get_rflags(vcpu);
+       rflags = static_call(kvm_x86_get_rflags)(vcpu);
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
                rflags &= ~X86_EFLAGS_TF;
        return rflags;
@@ -10903,7 +10980,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
            kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
                rflags |= X86_EFLAGS_TF;
-       kvm_x86_ops.set_rflags(vcpu, rflags);
+       static_call(kvm_x86_set_rflags)(vcpu, rflags);
 }
 
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
@@ -11033,7 +11110,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
                return false;
 
        if (!kvm_pv_async_pf_enabled(vcpu) ||
-           (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0))
+           (vcpu->arch.apf.send_user_only && static_call(kvm_x86_get_cpl)(vcpu) == 0))
                return false;
 
        return true;
@@ -11178,7 +11255,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
 
        irqfd->producer = prod;
        kvm_arch_start_assignment(irqfd->kvm);
-       ret = kvm_x86_ops.update_pi_irte(irqfd->kvm,
+       ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm,
                                         prod->irq, irqfd->gsi, 1);
 
        if (ret)
@@ -11203,7 +11280,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
         * when the irq is masked/disabled or the consumer side (KVM
          * in this case) doesn't want to receive the interrupts.
        */
-       ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+       ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
        if (ret)
                printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
                       " fails: %d\n", irqfd->consumer.token, ret);
@@ -11214,7 +11291,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
                                   uint32_t guest_irq, bool set)
 {
-       return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set);
+       return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set);
 }
 
 bool kvm_vector_hashing_enabled(void)
@@ -11556,6 +11633,7 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
 }
 EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
 
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
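
For reference, the re-entry loop added to vcpu_enter_guest() above keys off the fastpath completion codes returned by the vendor run() hook. A sketch of the assumed enum (names recalled from arch/x86/include/asm/kvm_host.h; check the header for the authoritative definition):

/* The inner for (;;) loop re-enters the guest only for
 * EXIT_FASTPATH_REENTER_GUEST, and only while no exit request is pending. */
enum exit_fastpath_completion {
        EXIT_FASTPATH_NONE,             /* not handled; take the full exit path     */
        EXIT_FASTPATH_REENTER_GUEST,    /* handled; go straight back into the guest */
        EXIT_FASTPATH_EXIT_HANDLED,     /* handled, but leave the inner loop        */
};
typedef enum exit_fastpath_completion fastpath_t;
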
index c5ee0f5..39eb048 100644
@@ -98,7 +98,7 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
 
        if (!is_long_mode(vcpu))
                return false;
-       kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+       static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
        return cs_l;
 }
 
@@ -129,7 +129,7 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
 static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
 {
        ++vcpu->stat.tlb_flush;
-       kvm_x86_ops.tlb_flush_current(vcpu);
+       static_call(kvm_x86_tlb_flush_current)(vcpu);
 }
 
 static inline int is_pae(struct kvm_vcpu *vcpu)
@@ -244,9 +244,10 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
 
 static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
 {
-       return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu);
+       return is_smm(vcpu) || static_call(kvm_x86_apic_init_signal_blocked)(vcpu);
 }
 
+void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs);
 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -273,6 +274,8 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
                                          int page_num);
 bool kvm_vector_hashing_enabled(void);
 void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
+                                   void *insn, int insn_len);
 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                            int emulation_type, void *insn, int insn_len);
 fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
@@ -296,6 +299,8 @@ extern int pi_inject_timer;
 
 extern struct static_key kvm_no_apic_vcpu;
 
+extern bool report_ignored_msrs;
+
 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 {
        return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
@@ -391,7 +396,6 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
 void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
 int kvm_spec_ctrl_test_value(u64 value);
 bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu);
 int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
                              struct x86_exception *e);
 int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
@@ -425,6 +429,8 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
                __reserved_bits |= X86_CR4_UMIP;        \
        if (!__cpu_has(__c, X86_FEATURE_VMX))           \
                __reserved_bits |= X86_CR4_VMXE;        \
+       if (!__cpu_has(__c, X86_FEATURE_PCID))          \
+               __reserved_bits |= X86_CR4_PCIDE;       \
        __reserved_bits;                                \
 })
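
The new arch/x86/kvm/xen.c below supplies the in-kernel side of Xen HVM emulation. As a usage sketch only (it assumes headers from a kernel carrying this series; the MSR index is an example value the VMM would advertise through its Xen CPUID leaves, not something KVM mandates), userspace opts in to hypercall interception roughly like this:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_xen_hcall_intercept(int vm_fd)
{
        struct kvm_xen_hvm_config cfg;

        memset(&cfg, 0, sizeof(cfg));
        cfg.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL;
        cfg.msr = 0x40000200;   /* example hypercall-page MSR shown to the guest */

        /* Once set, a guest write to cfg.msr makes KVM fill the hypercall page
         * with VMCALL/VMMCALL stubs (kvm_xen_write_hypercall_page below) and
         * route the resulting hypercalls to kvm_xen_hypercall(). */
        return ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg);
}
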
 
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
new file mode 100644
index 0000000..af8f656
--- /dev/null
@@ -0,0 +1,431 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
+ * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * KVM Xen emulation
+ */
+
+#include "x86.h"
+#include "xen.h"
+#include "hyperv.h"
+
+#include <linux/kvm_host.h>
+
+#include <trace/events/kvm.h>
+#include <xen/interface/xen.h>
+
+#include "trace.h"
+
+DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
+
+static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+{
+       gpa_t gpa = gfn_to_gpa(gfn);
+       int wc_ofs, sec_hi_ofs;
+       int ret;
+       int idx = srcu_read_lock(&kvm->srcu);
+
+       ret = kvm_gfn_to_hva_cache_init(kvm, &kvm->arch.xen.shinfo_cache,
+                                       gpa, PAGE_SIZE);
+       if (ret)
+               goto out;
+
+       kvm->arch.xen.shinfo_set = true;
+
+       /* Paranoia checks on the 32-bit struct layout */
+       BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
+       BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
+       BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+       /* 32-bit location by default */
+       wc_ofs = offsetof(struct compat_shared_info, wc);
+       sec_hi_ofs = offsetof(struct compat_shared_info, arch.wc_sec_hi);
+
+#ifdef CONFIG_X86_64
+       /* Paranoia checks on the 64-bit struct layout */
+       BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
+       BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);
+
+       if (kvm->arch.xen.long_mode) {
+               wc_ofs = offsetof(struct shared_info, wc);
+               sec_hi_ofs = offsetof(struct shared_info, wc_sec_hi);
+       }
+#endif
+
+       kvm_write_wall_clock(kvm, gpa + wc_ofs, sec_hi_ofs - wc_ofs);
+       kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
+
+out:
+       srcu_read_unlock(&kvm->srcu, idx);
+       return ret;
+}
+
+int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
+{
+       u8 rc = 0;
+
+       /*
+        * If the global upcall vector (HVMIRQ_callback_vector) is set and
+        * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
+        */
+       struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
+       struct kvm_memslots *slots = kvm_memslots(v->kvm);
+       unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
+
+       /* No need for compat handling here */
+       BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
+                    offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
+       BUILD_BUG_ON(sizeof(rc) !=
+                    sizeof(((struct vcpu_info *)0)->evtchn_upcall_pending));
+       BUILD_BUG_ON(sizeof(rc) !=
+                    sizeof(((struct compat_vcpu_info *)0)->evtchn_upcall_pending));
+
+       /*
+        * For efficiency, this mirrors the checks for using the valid
+        * cache in kvm_read_guest_offset_cached(), but just uses
+        * __get_user() instead. And falls back to the slow path.
+        */
+       if (likely(slots->generation == ghc->generation &&
+                  !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
+               /* Fast path */
+               __get_user(rc, (u8 __user *)ghc->hva + offset);
+       } else {
+               /* Slow path */
+               kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
+                                            sizeof(rc));
+       }
+
+       return rc;
+}
+
+int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+       int r = -ENOENT;
+
+       mutex_lock(&kvm->lock);
+
+       switch (data->type) {
+       case KVM_XEN_ATTR_TYPE_LONG_MODE:
+               if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
+                       r = -EINVAL;
+               } else {
+                       kvm->arch.xen.long_mode = !!data->u.long_mode;
+                       r = 0;
+               }
+               break;
+
+       case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+               if (data->u.shared_info.gfn == GPA_INVALID) {
+                       kvm->arch.xen.shinfo_set = false;
+                       r = 0;
+                       break;
+               }
+               r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
+               break;
+
+       case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
+               if (data->u.vector && data->u.vector < 0x10)
+                       r = -EINVAL;
+               else {
+                       kvm->arch.xen.upcall_vector = data->u.vector;
+                       r = 0;
+               }
+               break;
+
+       default:
+               break;
+       }
+
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+
+int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+       int r = -ENOENT;
+
+       mutex_lock(&kvm->lock);
+
+       switch (data->type) {
+       case KVM_XEN_ATTR_TYPE_LONG_MODE:
+               data->u.long_mode = kvm->arch.xen.long_mode;
+               r = 0;
+               break;
+
+       case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+               if (kvm->arch.xen.shinfo_set)
+                       data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
+               else
+                       data->u.shared_info.gfn = GPA_INVALID;
+               r = 0;
+               break;
+
+       case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
+               data->u.vector = kvm->arch.xen.upcall_vector;
+               r = 0;
+               break;
+
+       default:
+               break;
+       }
+
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+
+int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
+{
+       int idx, r = -ENOENT;
+
+       mutex_lock(&vcpu->kvm->lock);
+       idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+       switch (data->type) {
+       case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+               /* No compat necessary here. */
+               BUILD_BUG_ON(sizeof(struct vcpu_info) !=
+                            sizeof(struct compat_vcpu_info));
+
+               if (data->u.gpa == GPA_INVALID) {
+                       vcpu->arch.xen.vcpu_info_set = false;
+                       break;
+               }
+
+               r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+                                             &vcpu->arch.xen.vcpu_info_cache,
+                                             data->u.gpa,
+                                             sizeof(struct vcpu_info));
+               if (!r) {
+                       vcpu->arch.xen.vcpu_info_set = true;
+                       kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+               }
+               break;
+
+       case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
+               if (data->u.gpa == GPA_INVALID) {
+                       vcpu->arch.xen.vcpu_time_info_set = false;
+                       break;
+               }
+
+               r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+                                             &vcpu->arch.xen.vcpu_time_info_cache,
+                                             data->u.gpa,
+                                             sizeof(struct pvclock_vcpu_time_info));
+               if (!r) {
+                       vcpu->arch.xen.vcpu_time_info_set = true;
+                       kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+               }
+               break;
+
+       default:
+               break;
+       }
+
+       srcu_read_unlock(&vcpu->kvm->srcu, idx);
+       mutex_unlock(&vcpu->kvm->lock);
+       return r;
+}
+
+int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
+{
+       int r = -ENOENT;
+
+       mutex_lock(&vcpu->kvm->lock);
+
+       switch (data->type) {
+       case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+               if (vcpu->arch.xen.vcpu_info_set)
+                       data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
+               else
+                       data->u.gpa = GPA_INVALID;
+               r = 0;
+               break;
+
+       case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
+               if (vcpu->arch.xen.vcpu_time_info_set)
+                       data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
+               else
+                       data->u.gpa = GPA_INVALID;
+               r = 0;
+               break;
+
+       default:
+               break;
+       }
+
+       mutex_unlock(&vcpu->kvm->lock);
+       return r;
+}
+
+int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
+{
+       struct kvm *kvm = vcpu->kvm;
+       u32 page_num = data & ~PAGE_MASK;
+       u64 page_addr = data & PAGE_MASK;
+       bool lm = is_long_mode(vcpu);
+
+       /* Latch long_mode for shared_info pages etc. */
+       vcpu->kvm->arch.xen.long_mode = lm;
+
+       /*
+        * If Xen hypercall intercept is enabled, fill the hypercall
+        * page with VMCALL/VMMCALL instructions since that's what
+        * we catch. Else the VMM has provided the hypercall pages
+        * with instructions of its own choosing, so use those.
+        */
+       if (kvm_xen_hypercall_enabled(kvm)) {
+               u8 instructions[32];
+               int i;
+
+               if (page_num)
+                       return 1;
+
+               /* mov imm32, %eax */
+               instructions[0] = 0xb8;
+
+               /* vmcall / vmmcall */
+               kvm_x86_ops.patch_hypercall(vcpu, instructions + 5);
+
+               /* ret */
+               instructions[8] = 0xc3;
+
+               /* int3 to pad */
+               memset(instructions + 9, 0xcc, sizeof(instructions) - 9);
+
+               for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
+                       *(u32 *)&instructions[1] = i;
+                       if (kvm_vcpu_write_guest(vcpu,
+                                                page_addr + (i * sizeof(instructions)),
+                                                instructions, sizeof(instructions)))
+                               return 1;
+               }
+       } else {
+               /*
+                * Note, truncation is a non-issue as 'lm' is guaranteed to be
+                * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
+                */
+               hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64
+                                    : kvm->arch.xen_hvm_config.blob_addr_32;
+               u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
+                                 : kvm->arch.xen_hvm_config.blob_size_32;
+               u8 *page;
+
+               if (page_num >= blob_size)
+                       return 1;
+
+               blob_addr += page_num * PAGE_SIZE;
+
+               page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
+               if (IS_ERR(page))
+                       return PTR_ERR(page);
+
+               if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
+                       kfree(page);
+                       return 1;
+               }
+
+               /* Also free the copy on success; otherwise it is leaked. */
+               kfree(page);
+       }
+       return 0;
+}
+
+int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
+{
+       if (xhc->flags & ~KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
+               return -EINVAL;
+
+       /*
+        * With hypercall interception the kernel generates its own
+        * hypercall page so it must not be provided.
+        */
+       if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
+           (xhc->blob_addr_32 || xhc->blob_addr_64 ||
+            xhc->blob_size_32 || xhc->blob_size_64))
+               return -EINVAL;
+
+       mutex_lock(&kvm->lock);
+
+       if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
+               static_branch_inc(&kvm_xen_enabled.key);
+       else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
+               static_branch_slow_dec_deferred(&kvm_xen_enabled);
+
+       memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
+
+       mutex_unlock(&kvm->lock);
+       return 0;
+}
+
+void kvm_xen_destroy_vm(struct kvm *kvm)
+{
+       if (kvm->arch.xen_hvm_config.msr)
+               static_branch_slow_dec_deferred(&kvm_xen_enabled);
+}
+
+static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+       kvm_rax_write(vcpu, result);
+       return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+       struct kvm_run *run = vcpu->run;
+
+       if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
+               return 1;
+
+       return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
+}
+
+int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
+{
+       bool longmode;
+       u64 input, params[6];
+
+       input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
+
+       /* Hyper-V hypercalls get bit 31 set in EAX */
+       if ((input & 0x80000000) &&
+           kvm_hv_hypercall_enabled(vcpu))
+               return kvm_hv_hypercall(vcpu);
+
+       longmode = is_64_bit_mode(vcpu);
+       if (!longmode) {
+               params[0] = (u32)kvm_rbx_read(vcpu);
+               params[1] = (u32)kvm_rcx_read(vcpu);
+               params[2] = (u32)kvm_rdx_read(vcpu);
+               params[3] = (u32)kvm_rsi_read(vcpu);
+               params[4] = (u32)kvm_rdi_read(vcpu);
+               params[5] = (u32)kvm_rbp_read(vcpu);
+       }
+#ifdef CONFIG_X86_64
+       else {
+               params[0] = (u64)kvm_rdi_read(vcpu);
+               params[1] = (u64)kvm_rsi_read(vcpu);
+               params[2] = (u64)kvm_rdx_read(vcpu);
+               params[3] = (u64)kvm_r10_read(vcpu);
+               params[4] = (u64)kvm_r8_read(vcpu);
+               params[5] = (u64)kvm_r9_read(vcpu);
+       }
+#endif
+       trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
+                               params[3], params[4], params[5]);
+
+       vcpu->run->exit_reason = KVM_EXIT_XEN;
+       vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
+       vcpu->run->xen.u.hcall.longmode = longmode;
+       vcpu->run->xen.u.hcall.cpl = kvm_x86_ops.get_cpl(vcpu);
+       vcpu->run->xen.u.hcall.input = input;
+       vcpu->run->xen.u.hcall.params[0] = params[0];
+       vcpu->run->xen.u.hcall.params[1] = params[1];
+       vcpu->run->xen.u.hcall.params[2] = params[2];
+       vcpu->run->xen.u.hcall.params[3] = params[3];
+       vcpu->run->xen.u.hcall.params[4] = params[4];
+       vcpu->run->xen.u.hcall.params[5] = params[5];
+       vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
+       vcpu->arch.complete_userspace_io =
+               kvm_xen_hypercall_complete_userspace;
+
+       return 0;
+}
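
For illustration, unhandled Xen hypercalls are bounced to userspace as KVM_EXIT_XEN above, and the VMM writes run->xen.u.hcall.result before the next KVM_RUN so that kvm_xen_hypercall_complete_userspace() can load it into RAX. A minimal, hypothetical VMM-side loop might look like the sketch below; the vcpu fd and the mmapped kvm_run region are assumptions of the example and none of this is code from the merge:

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical VMM-side loop: service KVM_EXIT_XEN hypercall exits. */
static int run_vcpu(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			return -errno;

		if (run->exit_reason != KVM_EXIT_XEN ||
		    run->xen.type != KVM_EXIT_XEN_HCALL)
			return 0;	/* hand other exit reasons to the caller */

		/*
		 * run->xen.u.hcall.input and .params[] describe the call;
		 * the value written to .result is what the guest sees in
		 * RAX once KVM_RUN resumes and completes the instruction.
		 */
		run->xen.u.hcall.result = -ENOSYS;
	}
}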
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
new file mode 100644 (file)
index 0000000..b66a921
--- /dev/null
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
+ * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * KVM Xen emulation
+ */
+
+#ifndef __ARCH_X86_KVM_XEN_H__
+#define __ARCH_X86_KVM_XEN_H__
+
+#include <linux/jump_label_ratelimit.h>
+
+extern struct static_key_false_deferred kvm_xen_enabled;
+
+int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
+int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
+int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
+int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
+int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
+void kvm_xen_destroy_vm(struct kvm *kvm);
+
+static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
+{
+       return static_branch_unlikely(&kvm_xen_enabled.key) &&
+               (kvm->arch.xen_hvm_config.flags &
+                KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL);
+}
+
+static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
+{
+       if (static_branch_unlikely(&kvm_xen_enabled.key) &&
+           vcpu->arch.xen.vcpu_info_set && vcpu->kvm->arch.xen.upcall_vector)
+               return __kvm_xen_has_interrupt(vcpu);
+
+       return 0;
+}
+
+/* 32-bit compatibility definitions, also used natively in 32-bit build */
+#include <asm/pvclock-abi.h>
+#include <asm/xen/interface.h>
+
+struct compat_arch_vcpu_info {
+       unsigned int cr2;
+       unsigned int pad[5];
+};
+
+struct compat_vcpu_info {
+       uint8_t evtchn_upcall_pending;
+       uint8_t evtchn_upcall_mask;
+       uint16_t pad;
+       uint32_t evtchn_pending_sel;
+       struct compat_arch_vcpu_info arch;
+       struct pvclock_vcpu_time_info time;
+}; /* 64 bytes (x86) */
+
+struct compat_arch_shared_info {
+       unsigned int max_pfn;
+       unsigned int pfn_to_mfn_frame_list_list;
+       unsigned int nmi_reason;
+       unsigned int p2m_cr3;
+       unsigned int p2m_vaddr;
+       unsigned int p2m_generation;
+       uint32_t wc_sec_hi;
+};
+
+struct compat_shared_info {
+       struct compat_vcpu_info vcpu_info[MAX_VIRT_CPUS];
+       uint32_t evtchn_pending[32];
+       uint32_t evtchn_mask[32];
+       struct pvclock_wall_clock wc;
+       struct compat_arch_shared_info arch;
+};
+
+#endif /* __ARCH_X86_KVM_XEN_H__ */
index c79e573..c3d5f02 100644 (file)
@@ -382,6 +382,7 @@ bool sev_active(void)
 {
        return sev_status & MSR_AMD64_SEV_ENABLED;
 }
+EXPORT_SYMBOL_GPL(sev_active);
 
 /* Needs to be called from non-instrumentable code */
 bool noinstr sev_es_active(void)
index 584b0de..41c449e 100644 (file)
@@ -12,8 +12,8 @@
 #define _XTENSA_SPINLOCK_H
 
 #include <asm/barrier.h>
-#include <asm/qrwlock.h>
 #include <asm/qspinlock.h>
+#include <asm/qrwlock.h>
 
 #define smp_mb__after_spinlock()       smp_mb()
 
index 476113e..cb9b4c4 100644 (file)
@@ -128,6 +128,7 @@ static int sev_cmd_buffer_len(int cmd)
        case SEV_CMD_LAUNCH_UPDATE_SECRET:      return sizeof(struct sev_data_launch_secret);
        case SEV_CMD_DOWNLOAD_FIRMWARE:         return sizeof(struct sev_data_download_firmware);
        case SEV_CMD_GET_ID:                    return sizeof(struct sev_data_get_id);
+       case SEV_CMD_ATTESTATION_REPORT:        return sizeof(struct sev_data_attestation_report);
        default:                                return 0;
        }
 
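For illustration, the new SEV_CMD_ATTESTATION_REPORT command is exposed to VMMs later in this merge as KVM_SEV_GET_ATTESTATION_REPORT. A hedged userspace sketch of issuing it through KVM_MEMORY_ENCRYPT_OP follows; vm_fd, sev_fd and the caller-supplied report buffer are assumptions of the example:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: request an attestation report for a SEV guest. */
static int sev_get_attestation_report(int vm_fd, int sev_fd,
				      const __u8 mnonce[16],
				      void *buf, __u32 *len)
{
	struct kvm_sev_attestation_report report = {
		.uaddr = (__u64)(unsigned long)buf,
		.len   = *len,
	};
	struct kvm_sev_cmd cmd = {
		.id     = KVM_SEV_GET_ATTESTATION_REPORT,
		.data   = (__u64)(unsigned long)&report,
		.sev_fd = sev_fd,
	};
	int ret;

	memcpy(report.mnonce, mnonce, sizeof(report.mnonce));

	ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
	*len = report.len;	/* kernel writes back the (required) length */
	return ret;		/* on failure, cmd.error holds the SEV status */
}
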
index 60f1a38..b434825 100644 (file)
@@ -1703,7 +1703,7 @@ static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
                return -EINVAL;
        }
 
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
 
        if (kvmgt_gfn_is_write_protected(info, gfn))
                goto out;
@@ -1712,7 +1712,7 @@ static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
        kvmgt_protect_table_add(info, gfn);
 
 out:
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
        srcu_read_unlock(&kvm->srcu, idx);
        return 0;
 }
@@ -1737,7 +1737,7 @@ static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
                return -EINVAL;
        }
 
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
 
        if (!kvmgt_gfn_is_write_protected(info, gfn))
                goto out;
@@ -1746,7 +1746,7 @@ static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
        kvmgt_protect_table_del(info, gfn);
 
 out:
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
        srcu_read_unlock(&kvm->srcu, idx);
        return 0;
 }
@@ -1772,7 +1772,7 @@ static void kvmgt_page_track_flush_slot(struct kvm *kvm,
        struct kvmgt_guest_info *info = container_of(node,
                                        struct kvmgt_guest_info, track_node);
 
-       spin_lock(&kvm->mmu_lock);
+       write_lock(&kvm->mmu_lock);
        for (i = 0; i < slot->npages; i++) {
                gfn = slot->base_gfn + i;
                if (kvmgt_gfn_is_write_protected(info, gfn)) {
@@ -1781,7 +1781,7 @@ static void kvmgt_page_track_flush_slot(struct kvm *kvm,
                        kvmgt_protect_table_del(info, gfn);
                }
        }
-       spin_unlock(&kvm->mmu_lock);
+       write_unlock(&kvm->mmu_lock);
 }
 
 static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
index 26d5dcd..b3d27fd 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -810,11 +810,12 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
                address = pgoff_address(index, vma);
 
                /*
-                * Note because we provide range to follow_pte it will call
+                * follow_invalidate_pte() will use the range to call
                 * mmu_notifier_invalidate_range_start() on our behalf before
                 * taking any lock.
                 */
-               if (follow_pte(vma->vm_mm, address, &range, &ptep, &pmdp, &ptl))
+               if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
+                                         &pmdp, &ptl))
                        continue;
 
                /*
index 84ce841..7ae0ece 100644 (file)
@@ -15,6 +15,8 @@
 
 #include <asm-generic/qrwlock_types.h>
 
+/* Must be included from asm/spinlock.h after defining arch_spin_is_locked.  */
+
 /*
  * Writer states & reader shift and bias.
  */
@@ -116,15 +118,26 @@ static inline void queued_write_unlock(struct qrwlock *lock)
        smp_store_release(&lock->wlocked, 0);
 }
 
+/**
+ * queued_rwlock_is_contended - check if the lock is contended
+ * @lock : Pointer to queue rwlock structure
+ * Return: 1 if lock contended, 0 otherwise
+ */
+static inline int queued_rwlock_is_contended(struct qrwlock *lock)
+{
+       return arch_spin_is_locked(&lock->wait_lock);
+}
+
 /*
  * Remapping rwlock architecture specific functions to the corresponding
  * queue rwlock functions.
  */
-#define arch_read_lock(l)      queued_read_lock(l)
-#define arch_write_lock(l)     queued_write_lock(l)
-#define arch_read_trylock(l)   queued_read_trylock(l)
-#define arch_write_trylock(l)  queued_write_trylock(l)
-#define arch_read_unlock(l)    queued_read_unlock(l)
-#define arch_write_unlock(l)   queued_write_unlock(l)
+#define arch_read_lock(l)              queued_read_lock(l)
+#define arch_write_lock(l)             queued_write_lock(l)
+#define arch_read_trylock(l)           queued_read_trylock(l)
+#define arch_write_trylock(l)          queued_write_trylock(l)
+#define arch_read_unlock(l)            queued_read_unlock(l)
+#define arch_write_unlock(l)           queued_write_unlock(l)
+#define arch_rwlock_is_contended(l)    queued_rwlock_is_contended(l)
 
 #endif /* __ASM_GENERIC_QRWLOCK_H */
index f3b1013..e126ebd 100644 (file)
@@ -425,9 +425,8 @@ struct kvm_irq_routing_table {
 #define KVM_PRIVATE_MEM_SLOTS 0
 #endif
 
-#ifndef KVM_MEM_SLOTS_NUM
-#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
-#endif
+#define KVM_MEM_SLOTS_NUM SHRT_MAX
+#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_PRIVATE_MEM_SLOTS)
 
 #ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
 static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
@@ -451,7 +450,12 @@ struct kvm_memslots {
 };
 
 struct kvm {
+#ifdef KVM_HAVE_MMU_RWLOCK
+       rwlock_t mmu_lock;
+#else
        spinlock_t mmu_lock;
+#endif /* KVM_HAVE_MMU_RWLOCK */
+
        struct mutex slots_lock;
        struct mm_struct *mm; /* userspace tied to this vm */
        struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
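
For illustration, the point of making mmu_lock's type conditional is that common code can hide the two primitives behind one wrapper. A simplified sketch of that pattern is below; the macro names are hypothetical, not necessarily the ones used in kvm_main.c:

/* Hypothetical wrappers: pick the primitive matching mmu_lock's type. */
#ifdef KVM_HAVE_MMU_RWLOCK
#define MMU_LOCK_INIT(kvm)	rwlock_init(&(kvm)->mmu_lock)
#define MMU_LOCK(kvm)		write_lock(&(kvm)->mmu_lock)
#define MMU_UNLOCK(kvm)		write_unlock(&(kvm)->mmu_lock)
#else
#define MMU_LOCK_INIT(kvm)	spin_lock_init(&(kvm)->mmu_lock)
#define MMU_LOCK(kvm)		spin_lock(&(kvm)->mmu_lock)
#define MMU_UNLOCK(kvm)		spin_unlock(&(kvm)->mmu_lock)
#endif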
index ecdf8a8..24b292f 100644 (file)
@@ -1658,9 +1658,11 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
                unsigned long end, unsigned long floor, unsigned long ceiling);
 int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
+int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
+                         struct mmu_notifier_range *range, pte_t **ptepp,
+                         pmd_t **pmdpp, spinlock_t **ptlp);
 int follow_pte(struct mm_struct *mm, unsigned long address,
-               struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp,
-               spinlock_t **ptlp);
+              pte_t **ptepp, spinlock_t **ptlp);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
        unsigned long *pfn);
 int follow_phys(struct vm_area_struct *vma, unsigned long address,
index 49d155c..b801ead 100644 (file)
@@ -66,6 +66,7 @@ enum sev_cmd {
        SEV_CMD_LAUNCH_MEASURE          = 0x033,
        SEV_CMD_LAUNCH_UPDATE_SECRET    = 0x034,
        SEV_CMD_LAUNCH_FINISH           = 0x035,
+       SEV_CMD_ATTESTATION_REPORT      = 0x036,
 
        /* Guest migration commands (outgoing) */
        SEV_CMD_SEND_START              = 0x040,
@@ -483,6 +484,22 @@ struct sev_data_dbg {
        u32 len;                                /* In */
 } __packed;
 
+/**
+ * struct sev_data_attestation_report - SEV_ATTESTATION_REPORT command parameters
+ *
+ * @handle: handle of the VM
+ * @mnonce: a random nonce that will be included in the report.
+ * @address: physical address where the report will be copied.
+ * @len: length of the physical buffer.
+ */
+struct sev_data_attestation_report {
+       u32 handle;                             /* In */
+       u32 reserved;
+       u64 address;                            /* In */
+       u8 mnonce[16];                          /* In */
+       u32 len;                                /* In/Out */
+} __packed;
+
 #ifdef CONFIG_CRYPTO_DEV_SP_PSP
 
 /**
index 3dcd617..7ce9a51 100644 (file)
@@ -128,4 +128,11 @@ do {                                                               \
        1 : ({ local_irq_restore(flags); 0; }); \
 })
 
+#ifdef arch_rwlock_is_contended
+#define rwlock_is_contended(lock) \
+        arch_rwlock_is_contended(&(lock)->raw_lock)
+#else
+#define rwlock_is_contended(lock)      ((void)(lock), 0)
+#endif /* arch_rwlock_is_contended */
+
 #endif /* __LINUX_RWLOCK_H */
index 6e3a5ee..3052d16 100644 (file)
@@ -1883,12 +1883,24 @@ static inline int _cond_resched(void) { return 0; }
 })
 
 extern int __cond_resched_lock(spinlock_t *lock);
+extern int __cond_resched_rwlock_read(rwlock_t *lock);
+extern int __cond_resched_rwlock_write(rwlock_t *lock);
 
 #define cond_resched_lock(lock) ({                             \
        ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
        __cond_resched_lock(lock);                              \
 })
 
+#define cond_resched_rwlock_read(lock) ({                      \
+       __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
+       __cond_resched_rwlock_read(lock);                       \
+})
+
+#define cond_resched_rwlock_write(lock) ({                     \
+       __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
+       __cond_resched_rwlock_write(lock);                      \
+})
+
 static inline void cond_resched_rcu(void)
 {
 #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
@@ -1912,6 +1924,23 @@ static inline int spin_needbreak(spinlock_t *lock)
 #endif
 }
 
+/*
+ * Check if an rwlock is contended.
+ * Returns non-zero if there is another task waiting on the rwlock.
+ * Returns zero if the lock is not contended or if the underlying rwlock
+ * implementation does not support contention detection.
+ * Technically this does not depend on CONFIG_PREEMPTION, but on a general
+ * need for low latency.
+ */
+static inline int rwlock_needbreak(rwlock_t *lock)
+{
+#ifdef CONFIG_PREEMPTION
+       return rwlock_is_contended(lock);
+#else
+       return 0;
+#endif
+}
+
 static __always_inline bool need_resched(void)
 {
        return unlikely(tif_need_resched());
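
For illustration, the intended caller of cond_resched_rwlock_write() is a long walk performed under a held rwlock that periodically yields on contention or pending reschedule. The sketch below is hypothetical (the function, gfn range and loop body are assumptions, written as if in kernel context):

#include <linux/kvm_host.h>
#include <linux/sched.h>

/* Hypothetical user: zap a large range while allowing preemption points. */
static void zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
{
	gfn_t gfn;

	write_lock(&kvm->mmu_lock);
	for (gfn = start; gfn < end; gfn++) {
		/* ... drop one mapping ... */

		/*
		 * Drops and re-takes mmu_lock if it is contended or if
		 * rescheduling is needed, then continues the walk.
		 */
		cond_resched_rwlock_write(&kvm->mmu_lock);
	}
	write_unlock(&kvm->mmu_lock);
}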
index 374c678..8b281f7 100644 (file)
@@ -216,6 +216,20 @@ struct kvm_hyperv_exit {
        } u;
 };
 
+struct kvm_xen_exit {
+#define KVM_EXIT_XEN_HCALL          1
+       __u32 type;
+       union {
+               struct {
+                       __u32 longmode;
+                       __u32 cpl;
+                       __u64 input;
+                       __u64 result;
+                       __u64 params[6];
+               } hcall;
+       } u;
+};
+
 #define KVM_S390_GET_SKEYS_NONE   1
 #define KVM_S390_SKEYS_MAX        1048576
 
@@ -252,6 +266,8 @@ struct kvm_hyperv_exit {
 #define KVM_EXIT_X86_WRMSR        30
 #define KVM_EXIT_DIRTY_RING_FULL  31
 #define KVM_EXIT_AP_RESET_HOLD    32
+#define KVM_EXIT_X86_BUS_LOCK     33
+#define KVM_EXIT_XEN              34
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -428,6 +444,8 @@ struct kvm_run {
                        __u32 index; /* kernel -> user */
                        __u64 data; /* kernel <-> user */
                } msr;
+               /* KVM_EXIT_XEN */
+               struct kvm_xen_exit xen;
                /* Fix the size of the union. */
                char padding[256];
        };
@@ -1058,6 +1076,8 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
 #define KVM_CAP_SYS_HYPERV_CPUID 191
 #define KVM_CAP_DIRTY_LOG_RING 192
+#define KVM_CAP_X86_BUS_LOCK_EXIT 193
+#define KVM_CAP_PPC_DAWR1 194
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1131,6 +1151,10 @@ struct kvm_x86_mce {
 #endif
 
 #ifdef KVM_CAP_XEN_HVM
+#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR       (1 << 0)
+#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL     (1 << 1)
+#define KVM_XEN_HVM_CONFIG_SHARED_INFO         (1 << 2)
+
 struct kvm_xen_hvm_config {
        __u32 flags;
        __u32 msr;
@@ -1565,6 +1589,45 @@ struct kvm_pv_cmd {
 /* Available with KVM_CAP_DIRTY_LOG_RING */
 #define KVM_RESET_DIRTY_RINGS          _IO(KVMIO, 0xc7)
 
+/* Per-VM Xen attributes */
+#define KVM_XEN_HVM_GET_ATTR   _IOWR(KVMIO, 0xc8, struct kvm_xen_hvm_attr)
+#define KVM_XEN_HVM_SET_ATTR   _IOW(KVMIO,  0xc9, struct kvm_xen_hvm_attr)
+
+struct kvm_xen_hvm_attr {
+       __u16 type;
+       __u16 pad[3];
+       union {
+               __u8 long_mode;
+               __u8 vector;
+               struct {
+                       __u64 gfn;
+               } shared_info;
+               __u64 pad[8];
+       } u;
+};
+
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
+#define KVM_XEN_ATTR_TYPE_LONG_MODE            0x0
+#define KVM_XEN_ATTR_TYPE_SHARED_INFO          0x1
+#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR                0x2
+
+/* Per-vCPU Xen attributes */
+#define KVM_XEN_VCPU_GET_ATTR  _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr)
+#define KVM_XEN_VCPU_SET_ATTR  _IOW(KVMIO,  0xcb, struct kvm_xen_vcpu_attr)
+
+struct kvm_xen_vcpu_attr {
+       __u16 type;
+       __u16 pad[3];
+       union {
+               __u64 gpa;
+               __u64 pad[8];
+       } u;
+};
+
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO       0x0
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO  0x1
+
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
        /* Guest initialization commands */
@@ -1593,6 +1656,8 @@ enum sev_cmd_id {
        KVM_SEV_DBG_ENCRYPT,
        /* Guest certificates commands */
        KVM_SEV_CERT_EXPORT,
+       /* Attestation report */
+       KVM_SEV_GET_ATTESTATION_REPORT,
 
        KVM_SEV_NR_MAX,
 };
@@ -1645,6 +1710,12 @@ struct kvm_sev_dbg {
        __u32 len;
 };
 
+struct kvm_sev_attestation_report {
+       __u8 mnonce[16];
+       __u64 uaddr;
+       __u32 len;
+};
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU    (1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3         (1 << 1)
 #define KVM_DEV_ASSIGN_MASK_INTX       (1 << 2)
@@ -1766,4 +1837,7 @@ struct kvm_dirty_gfn {
        __u64 offset;
 };
 
+#define KVM_BUS_LOCK_DETECTION_OFF             (1 << 0)
+#define KVM_BUS_LOCK_DETECTION_EXIT            (1 << 1)
+
 #endif /* __LINUX_KVM_H */
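
For illustration, the per-VM Xen attributes above would typically be driven from the VMM as sketched below; vm_fd is an assumption, and KVM_XEN_HVM_CONFIG with KVM_XEN_HVM_CONFIG_SHARED_INFO is assumed to have been accepted beforehand. This is not code from the merge:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Hypothetical helper: tell KVM where the guest put its shared_info page
 * and which vector to use for event-channel upcalls.
 */
static int xen_set_shared_info(int vm_fd, __u64 shinfo_gfn, __u8 vector)
{
	struct kvm_xen_hvm_attr attr = {
		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
		.u.shared_info.gfn = shinfo_gfn,
	};

	if (ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr) < 0)
		return -1;

	attr = (struct kvm_xen_hvm_attr) {
		.type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR,
		.u.vector = vector,
	};
	return ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr);
}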
index 8bfb242..5ee37a2 100644 (file)
@@ -598,7 +598,9 @@ struct shared_info {
         * their gettimeofday() syscall on this wallclock-base value.
         */
        struct pvclock_wall_clock wc;
-
+#ifndef CONFIG_X86_32
+       uint32_t wc_sec_hi;
+#endif
        struct arch_shared_info arch;
 
 };
index fe9ca92..4786dd2 100644 (file)
@@ -12,7 +12,6 @@
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
 #include <linux/spinlock.h>
-#include <asm/qrwlock.h>
 
 /**
  * queued_read_lock_slowpath - acquire read lock of a queue rwlock
index 15d2562..ade3576 100644 (file)
@@ -6695,6 +6695,46 @@ int __cond_resched_lock(spinlock_t *lock)
 }
 EXPORT_SYMBOL(__cond_resched_lock);
 
+int __cond_resched_rwlock_read(rwlock_t *lock)
+{
+       int resched = should_resched(PREEMPT_LOCK_OFFSET);
+       int ret = 0;
+
+       lockdep_assert_held_read(lock);
+
+       if (rwlock_needbreak(lock) || resched) {
+               read_unlock(lock);
+               if (resched)
+                       preempt_schedule_common();
+               else
+                       cpu_relax();
+               ret = 1;
+               read_lock(lock);
+       }
+       return ret;
+}
+EXPORT_SYMBOL(__cond_resched_rwlock_read);
+
+int __cond_resched_rwlock_write(rwlock_t *lock)
+{
+       int resched = should_resched(PREEMPT_LOCK_OFFSET);
+       int ret = 0;
+
+       lockdep_assert_held_write(lock);
+
+       if (rwlock_needbreak(lock) || resched) {
+               write_unlock(lock);
+               if (resched)
+                       preempt_schedule_common();
+               else
+                       cpu_relax();
+               ret = 1;
+               write_lock(lock);
+       }
+       return ret;
+}
+EXPORT_SYMBOL(__cond_resched_rwlock_write);
+
 /**
  * yield - yield the current processor to other threads.
  *
index feff48e..985dac0 100644 (file)
@@ -4709,9 +4709,9 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-int follow_pte(struct mm_struct *mm, unsigned long address,
-              struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp,
-              spinlock_t **ptlp)
+int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
+                         struct mmu_notifier_range *range, pte_t **ptepp,
+                         pmd_t **pmdpp, spinlock_t **ptlp)
 {
        pgd_t *pgd;
        p4d_t *p4d;
@@ -4776,6 +4776,34 @@ out:
        return -EINVAL;
 }
 
+/**
+ * follow_pte - look up PTE at a user virtual address
+ * @mm: the mm_struct of the target address space
+ * @address: user virtual address
+ * @ptepp: location to store found PTE
+ * @ptlp: location to store the lock for the PTE
+ *
+ * On a successful return, the pointer to the PTE is stored in @ptepp;
+ * the corresponding lock is taken and its location is stored in @ptlp.
+ * The contents of the PTE are only stable until @ptlp is released;
+ * any further use, if any, must be protected against invalidation
+ * with MMU notifiers.
+ *
+ * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
+ * should be taken for read.
+ *
+ * KVM uses this function.  While it is arguably less bad than ``follow_pfn``,
+ * it is not a good general-purpose API.
+ *
+ * Return: zero on success, -ve otherwise.
+ */
+int follow_pte(struct mm_struct *mm, unsigned long address,
+              pte_t **ptepp, spinlock_t **ptlp)
+{
+       return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
+}
+EXPORT_SYMBOL_GPL(follow_pte);
+
 /**
  * follow_pfn - look up PFN at a user virtual address
  * @vma: memory mapping
@@ -4784,6 +4812,9 @@ out:
  *
  * Only IO mappings and raw PFN mappings are allowed.
  *
+ * This function does not allow the caller to read the permissions
+ * of the PTE.  Do not use it.
+ *
  * Return: zero and the pfn at @pfn on success, -ve otherwise.
  */
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
@@ -4796,7 +4827,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
                return ret;
 
-       ret = follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl);
+       ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
        if (ret)
                return ret;
        *pfn = pte_pfn(*ptep);
@@ -4817,7 +4848,7 @@ int follow_phys(struct vm_area_struct *vma,
        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
                goto out;
 
-       if (follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl))
+       if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
                goto out;
        pte = *ptep;
 
index c3af3f3..9f18fa0 100644 (file)
@@ -644,6 +644,8 @@ struct kvm_ppc_cpu_char {
 #define KVM_REG_PPC_MMCR3      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc1)
 #define KVM_REG_PPC_SIER2      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc2)
 #define KVM_REG_PPC_SIER3      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc3)
+#define KVM_REG_PPC_DAWR1      (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc4)
+#define KVM_REG_PPC_DAWRX1     (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc5)
 
 /* Transactional Memory checkpointed state:
  * This is all GPRs, all VSX regs and a subset of SPRs
index 374c678..abb89bb 100644 (file)
@@ -1058,6 +1058,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
 #define KVM_CAP_SYS_HYPERV_CPUID 191
 #define KVM_CAP_DIRTY_LOG_RING 192
+#define KVM_CAP_PPC_DAWR1 194
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
index ce8f4ad..3a84394 100644 (file)
@@ -7,6 +7,7 @@
 /x86_64/cr4_cpuid_sync_test
 /x86_64/debug_regs
 /x86_64/evmcs_test
+/x86_64/get_cpuid_test
 /x86_64/kvm_pv_test
 /x86_64/hyperv_cpuid
 /x86_64/mmio_warning_test
 /x86_64/vmx_preemption_timer_test
 /x86_64/vmx_set_nested_state_test
 /x86_64/vmx_tsc_adjust_test
+/x86_64/xapic_ipi_test
+/x86_64/xen_shinfo_test
+/x86_64/xen_vmcall_test
 /x86_64/xss_msr_test
+/x86_64/vmx_pmu_msrs_test
 /demand_paging_test
 /dirty_log_test
 /dirty_log_perf_test
 /kvm_create_max_vcpus
+/memslot_modification_stress_test
 /set_memory_region_test
 /steal_time
index fe41c6a..8c8eda4 100644 (file)
@@ -40,6 +40,7 @@ LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_ha
 
 TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test
 TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
+TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
 TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
 TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
@@ -56,13 +57,18 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
+TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test
 TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
 TEST_GEN_PROGS_x86_64 += x86_64/debug_regs
 TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_msrs_test
+TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test
+TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
 TEST_GEN_PROGS_x86_64 += demand_paging_test
 TEST_GEN_PROGS_x86_64 += dirty_log_test
 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
 TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
 TEST_GEN_PROGS_x86_64 += set_memory_region_test
 TEST_GEN_PROGS_x86_64 += steal_time
 
index cdad1ec..5f7a229 100644 (file)
@@ -64,7 +64,7 @@ static void *vcpu_worker(void *data)
                            exit_reason_str(run->exit_reason));
        }
 
-       ts_diff = timespec_diff_now(start);
+       ts_diff = timespec_elapsed(start);
        PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id,
                       ts_diff.tv_sec, ts_diff.tv_nsec);
 
@@ -95,7 +95,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr)
                return r;
        }
 
-       ts_diff = timespec_diff_now(start);
+       ts_diff = timespec_elapsed(start);
 
        PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid,
                       timespec_to_ns(ts_diff));
@@ -190,7 +190,7 @@ static void *uffd_handler_thread_fn(void *arg)
                pages++;
        }
 
-       ts_diff = timespec_diff_now(start);
+       ts_diff = timespec_elapsed(start);
        PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n",
                       pages, ts_diff.tv_sec, ts_diff.tv_nsec,
                       pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0));
@@ -250,6 +250,7 @@ static int setup_demand_paging(struct kvm_vm *vm,
 struct test_params {
        bool use_uffd;
        useconds_t uffd_delay;
+       bool partition_vcpu_memory_access;
 };
 
 static void run_test(enum vm_guest_mode mode, void *arg)
@@ -265,7 +266,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
        int vcpu_id;
        int r;
 
-       vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size);
+       vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size,
+                                VM_MEM_SRC_ANONYMOUS);
 
        perf_test_args.wr_fract = 1;
 
@@ -277,7 +279,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
        vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
        TEST_ASSERT(vcpu_threads, "Memory allocation failed");
 
-       perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size);
+       perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size,
+                             p->partition_vcpu_memory_access);
 
        if (p->use_uffd) {
                uffd_handler_threads =
@@ -293,10 +296,19 @@ static void run_test(enum vm_guest_mode mode, void *arg)
                for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
                        vm_paddr_t vcpu_gpa;
                        void *vcpu_hva;
+                       uint64_t vcpu_mem_size;
 
-                       vcpu_gpa = guest_test_phys_mem + (vcpu_id * guest_percpu_mem_size);
+
+                       if (p->partition_vcpu_memory_access) {
+                               vcpu_gpa = guest_test_phys_mem +
+                                          (vcpu_id * guest_percpu_mem_size);
+                               vcpu_mem_size = guest_percpu_mem_size;
+                       } else {
+                               vcpu_gpa = guest_test_phys_mem;
+                               vcpu_mem_size = guest_percpu_mem_size * nr_vcpus;
+                       }
                        PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n",
-                                      vcpu_id, vcpu_gpa, vcpu_gpa + guest_percpu_mem_size);
+                                      vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_mem_size);
 
                        /* Cache the HVA pointer of the region */
                        vcpu_hva = addr_gpa2hva(vm, vcpu_gpa);
@@ -313,7 +325,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
                                                &uffd_handler_threads[vcpu_id],
                                                pipefds[vcpu_id * 2],
                                                p->uffd_delay, &uffd_args[vcpu_id],
-                                               vcpu_hva, guest_percpu_mem_size);
+                                               vcpu_hva, vcpu_mem_size);
                        if (r < 0)
                                exit(-r);
                }
@@ -339,7 +351,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
                PER_VCPU_DEBUG("Joined thread for vCPU %d\n", vcpu_id);
        }
 
-       ts_diff = timespec_diff_now(start);
+       ts_diff = timespec_elapsed(start);
 
        pr_info("All vCPU threads joined\n");
 
@@ -376,7 +388,7 @@ static void help(char *name)
 {
        puts("");
        printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n"
-              "          [-b memory] [-v vcpus]\n", name);
+              "          [-b memory] [-v vcpus] [-o]\n", name);
        guest_modes_help();
        printf(" -u: use User Fault FD to handle vCPU page\n"
               "     faults.\n");
@@ -387,6 +399,8 @@ static void help(char *name)
               "     demand paged by each vCPU. e.g. 10M or 3G.\n"
               "     Default: 1G\n");
        printf(" -v: specify the number of vCPUs to run.\n");
+       printf(" -o: Overlap guest memory accesses instead of partitioning\n"
+              "     them into a separate region of memory for each vCPU.\n");
        puts("");
        exit(0);
 }
@@ -394,12 +408,14 @@ static void help(char *name)
 int main(int argc, char *argv[])
 {
        int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
-       struct test_params p = {};
+       struct test_params p = {
+               .partition_vcpu_memory_access = true,
+       };
        int opt;
 
        guest_modes_append_default();
 
-       while ((opt = getopt(argc, argv, "hm:ud:b:v:")) != -1) {
+       while ((opt = getopt(argc, argv, "hm:ud:b:v:o")) != -1) {
                switch (opt) {
                case 'm':
                        guest_modes_cmdline(optarg);
@@ -419,6 +435,9 @@ int main(int argc, char *argv[])
                        TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
                                    "Invalid number of vcpus, must be between 1 and %d", max_vcpus);
                        break;
+               case 'o':
+                       p.partition_vcpu_memory_access = false;
+                       break;
                case 'h':
                default:
                        help(argv[0]);
index 2283a0e..04a2641 100644 (file)
@@ -28,8 +28,8 @@ static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
 /* Host variables */
 static u64 dirty_log_manual_caps;
 static bool host_quit;
-static uint64_t iteration;
-static uint64_t vcpu_last_completed_iteration[KVM_MAX_VCPUS];
+static int iteration;
+static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
 
 static void *vcpu_worker(void *data)
 {
@@ -48,11 +48,11 @@ static void *vcpu_worker(void *data)
        run = vcpu_state(vm, vcpu_id);
 
        while (!READ_ONCE(host_quit)) {
-               uint64_t current_iteration = READ_ONCE(iteration);
+               int current_iteration = READ_ONCE(iteration);
 
                clock_gettime(CLOCK_MONOTONIC, &start);
                ret = _vcpu_run(vm, vcpu_id);
-               ts_diff = timespec_diff_now(start);
+               ts_diff = timespec_elapsed(start);
 
                TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
                TEST_ASSERT(get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC,
@@ -61,17 +61,17 @@ static void *vcpu_worker(void *data)
 
                pr_debug("Got sync event from vCPU %d\n", vcpu_id);
                vcpu_last_completed_iteration[vcpu_id] = current_iteration;
-               pr_debug("vCPU %d updated last completed iteration to %lu\n",
+               pr_debug("vCPU %d updated last completed iteration to %d\n",
                         vcpu_id, vcpu_last_completed_iteration[vcpu_id]);
 
                if (current_iteration) {
                        pages_count += vcpu_args->pages;
                        total = timespec_add(total, ts_diff);
-                       pr_debug("vCPU %d iteration %lu dirty memory time: %ld.%.9lds\n",
+                       pr_debug("vCPU %d iteration %d dirty memory time: %ld.%.9lds\n",
                                vcpu_id, current_iteration, ts_diff.tv_sec,
                                ts_diff.tv_nsec);
                } else {
-                       pr_debug("vCPU %d iteration %lu populate memory time: %ld.%.9lds\n",
+                       pr_debug("vCPU %d iteration %d populate memory time: %ld.%.9lds\n",
                                vcpu_id, current_iteration, ts_diff.tv_sec,
                                ts_diff.tv_nsec);
                }
@@ -81,7 +81,7 @@ static void *vcpu_worker(void *data)
        }
 
        avg = timespec_div(total, vcpu_last_completed_iteration[vcpu_id]);
-       pr_debug("\nvCPU %d dirtied 0x%lx pages over %lu iterations in %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
+       pr_debug("\nvCPU %d dirtied 0x%lx pages over %d iterations in %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
                vcpu_id, pages_count, vcpu_last_completed_iteration[vcpu_id],
                total.tv_sec, total.tv_nsec, avg.tv_sec, avg.tv_nsec);
 
@@ -92,6 +92,8 @@ struct test_params {
        unsigned long iterations;
        uint64_t phys_offset;
        int wr_fract;
+       bool partition_vcpu_memory_access;
+       enum vm_mem_backing_src_type backing_src;
 };
 
 static void run_test(enum vm_guest_mode mode, void *arg)
@@ -111,7 +113,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
        struct kvm_enable_cap cap = {};
        struct timespec clear_dirty_log_total = (struct timespec){0};
 
-       vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size);
+       vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size,
+                                p->backing_src);
 
        perf_test_args.wr_fract = p->wr_fract;
 
@@ -129,7 +132,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
        vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
        TEST_ASSERT(vcpu_threads, "Memory allocation failed");
 
-       perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size);
+       perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size,
+                             p->partition_vcpu_memory_access);
 
        sync_global_to_guest(vm, perf_test_args);
 
@@ -139,17 +143,21 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
        clock_gettime(CLOCK_MONOTONIC, &start);
        for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+               vcpu_last_completed_iteration[vcpu_id] = -1;
+
                pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
                               &perf_test_args.vcpu_args[vcpu_id]);
        }
 
-       /* Allow the vCPU to populate memory */
-       pr_debug("Starting iteration %lu - Populating\n", iteration);
-       while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration)
-               pr_debug("Waiting for vcpu_last_completed_iteration == %lu\n",
-                       iteration);
+       /* Allow the vCPUs to populate memory */
+       pr_debug("Starting iteration %d - Populating\n", iteration);
+       for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+               while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) !=
+                      iteration)
+                       ;
+       }
 
-       ts_diff = timespec_diff_now(start);
+       ts_diff = timespec_elapsed(start);
        pr_info("Populate memory time: %ld.%.9lds\n",
                ts_diff.tv_sec, ts_diff.tv_nsec);
 
@@ -157,7 +165,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
        clock_gettime(CLOCK_MONOTONIC, &start);
        vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX,
                                KVM_MEM_LOG_DIRTY_PAGES);
-       ts_diff = timespec_diff_now(start);
+       ts_diff = timespec_elapsed(start);
        pr_info("Enabling dirty logging time: %ld.%.9lds\n\n",
                ts_diff.tv_sec, ts_diff.tv_nsec);
 
@@ -169,25 +177,25 @@ static void run_test(enum vm_guest_mode mode, void *arg)
                clock_gettime(CLOCK_MONOTONIC, &start);
                iteration++;
 
-               pr_debug("Starting iteration %lu\n", iteration);
+               pr_debug("Starting iteration %d\n", iteration);
                for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
-                       while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration)
-                               pr_debug("Waiting for vCPU %d vcpu_last_completed_iteration == %lu\n",
-                                        vcpu_id, iteration);
+                       while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id])
+                              != iteration)
+                               ;
                }
 
-               ts_diff = timespec_diff_now(start);
+               ts_diff = timespec_elapsed(start);
                vcpu_dirty_total = timespec_add(vcpu_dirty_total, ts_diff);
-               pr_info("Iteration %lu dirty memory time: %ld.%.9lds\n",
+               pr_info("Iteration %d dirty memory time: %ld.%.9lds\n",
                        iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
 
                clock_gettime(CLOCK_MONOTONIC, &start);
                kvm_vm_get_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap);
 
-               ts_diff = timespec_diff_now(start);
+               ts_diff = timespec_elapsed(start);
                get_dirty_log_total = timespec_add(get_dirty_log_total,
                                                   ts_diff);
-               pr_info("Iteration %lu get dirty log time: %ld.%.9lds\n",
+               pr_info("Iteration %d get dirty log time: %ld.%.9lds\n",
                        iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
 
                if (dirty_log_manual_caps) {
@@ -195,26 +203,26 @@ static void run_test(enum vm_guest_mode mode, void *arg)
                        kvm_vm_clear_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap, 0,
                                               host_num_pages);
 
-                       ts_diff = timespec_diff_now(start);
+                       ts_diff = timespec_elapsed(start);
                        clear_dirty_log_total = timespec_add(clear_dirty_log_total,
                                                             ts_diff);
-                       pr_info("Iteration %lu clear dirty log time: %ld.%.9lds\n",
+                       pr_info("Iteration %d clear dirty log time: %ld.%.9lds\n",
                                iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
                }
        }
 
-       /* Tell the vcpu thread to quit */
-       host_quit = true;
-       for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++)
-               pthread_join(vcpu_threads[vcpu_id], NULL);
-
        /* Disable dirty logging */
        clock_gettime(CLOCK_MONOTONIC, &start);
        vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX, 0);
-       ts_diff = timespec_diff_now(start);
+       ts_diff = timespec_elapsed(start);
        pr_info("Disabling dirty logging time: %ld.%.9lds\n",
                ts_diff.tv_sec, ts_diff.tv_nsec);
 
+       /* Tell the vcpu thread to quit */
+       host_quit = true;
+       for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++)
+               pthread_join(vcpu_threads[vcpu_id], NULL);
+
        avg = timespec_div(get_dirty_log_total, p->iterations);
        pr_info("Get dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
                p->iterations, get_dirty_log_total.tv_sec,
@@ -236,7 +244,7 @@ static void help(char *name)
 {
        puts("");
        printf("usage: %s [-h] [-i iterations] [-p offset] "
-              "[-m mode] [-b vcpu bytes] [-v vcpus]\n", name);
+              "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]\n", name);
        puts("");
        printf(" -i: specify iteration counts (default: %"PRIu64")\n",
               TEST_HOST_LOOP_N);
@@ -251,6 +259,11 @@ static void help(char *name)
               "     1/<fraction of pages to write>.\n"
               "     (default: 1 i.e. all pages are written to.)\n");
        printf(" -v: specify the number of vCPUs to run.\n");
+       printf(" -o: Overlap guest memory accesses instead of partitioning\n"
+              "     them into a separate region of memory for each vCPU.\n");
+       printf(" -s: specify the type of memory that should be used to\n"
+              "     back the guest data region.\n\n");
+       backing_src_help();
        puts("");
        exit(0);
 }
@@ -261,6 +274,8 @@ int main(int argc, char *argv[])
        struct test_params p = {
                .iterations = TEST_HOST_LOOP_N,
                .wr_fract = 1,
+               .partition_vcpu_memory_access = true,
+               .backing_src = VM_MEM_SRC_ANONYMOUS,
        };
        int opt;
 
@@ -271,10 +286,10 @@ int main(int argc, char *argv[])
 
        guest_modes_append_default();
 
-       while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:")) != -1) {
+       while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:os:")) != -1) {
                switch (opt) {
                case 'i':
-                       p.iterations = strtol(optarg, NULL, 10);
+                       p.iterations = atoi(optarg);
                        break;
                case 'p':
                        p.phys_offset = strtoull(optarg, NULL, 0);
@@ -295,6 +310,11 @@ int main(int argc, char *argv[])
                        TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
                                    "Invalid number of vcpus, must be between 1 and %d", max_vcpus);
                        break;
+               case 'o':
+                       p.partition_vcpu_memory_access = false;
+                       break;
+               case 's':
+                       p.backing_src = parse_backing_src_type(optarg);
+                       break;
                case 'h':
                default:
                        help(argv[0]);
index 5cbb861..2d7eb69 100644 (file)
@@ -79,12 +79,6 @@ struct vm_guest_mode_params {
 };
 extern const struct vm_guest_mode_params vm_guest_mode_params[];
 
-enum vm_mem_backing_src_type {
-       VM_MEM_SRC_ANONYMOUS,
-       VM_MEM_SRC_ANONYMOUS_THP,
-       VM_MEM_SRC_ANONYMOUS_HUGETLB,
-};
-
 int kvm_check_cap(long cap);
 int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
 int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
diff --git a/tools/testing/selftests/kvm/include/numaif.h b/tools/testing/selftests/kvm/include/numaif.h
new file mode 100644 (file)
index 0000000..b020547
--- /dev/null
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/include/numaif.h
+ *
+ * Copyright (C) 2020, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Header file that provides access to NUMA API functions not explicitly
+ * exported to user space.
+ */
+
+#ifndef SELFTEST_KVM_NUMAIF_H
+#define SELFTEST_KVM_NUMAIF_H
+
+#define __NR_get_mempolicy 239
+#define __NR_migrate_pages 256
+
+/* System calls */
+long get_mempolicy(int *policy, const unsigned long *nmask,
+                  unsigned long maxnode, void *addr, int flags)
+{
+       return syscall(__NR_get_mempolicy, policy, nmask,
+                      maxnode, addr, flags);
+}
+
+long migrate_pages(int pid, unsigned long maxnode,
+                  const unsigned long *frommask,
+                  const unsigned long *tomask)
+{
+       return syscall(__NR_migrate_pages, pid, maxnode, frommask, tomask);
+}
+
+/* Policies */
+#define MPOL_DEFAULT    0
+#define MPOL_PREFERRED  1
+#define MPOL_BIND       2
+#define MPOL_INTERLEAVE         3
+
+#define MPOL_MAX MPOL_INTERLEAVE
+
+/* Flags for get_mem_policy */
+#define MPOL_F_NODE        (1<<0)  /* return next il node or node of address */
+                                   /* Warning: MPOL_F_NODE is unsupported and
+                                    * subject to change. Don't use.
+                                    */
+#define MPOL_F_ADDR        (1<<1)  /* look up vma using address */
+#define MPOL_F_MEMS_ALLOWED (1<<2)  /* query nodes allowed in cpuset */
+
+/* Flags for mbind */
+#define MPOL_MF_STRICT      (1<<0) /* Verify existing pages in the mapping */
+#define MPOL_MF_MOVE        (1<<1) /* Move pages owned by this process to conform to mapping */
+#define MPOL_MF_MOVE_ALL     (1<<2) /* Move every page to conform to mapping */
+
+#endif /* SELFTEST_KVM_NUMAIF_H */
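
For illustration, a test could use the wrappers in this header to find out which NUMA node currently backs a page. The helper below is a hedged sketch (it assumes the header above is included and keeps error handling minimal):

/* Hypothetical helper: return the NUMA node backing 'addr', or -1. */
static int node_of_address(void *addr)
{
	int node = -1;

	/* MPOL_F_NODE | MPOL_F_ADDR asks for the node of a specific address. */
	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR))
		return -1;

	return node;
}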
index b118882..005f214 100644 (file)
@@ -44,8 +44,11 @@ extern struct perf_test_args perf_test_args;
 extern uint64_t guest_test_phys_mem;
 
 struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
-                               uint64_t vcpu_memory_bytes);
+                                  uint64_t vcpu_memory_bytes,
+                                  enum vm_mem_backing_src_type backing_src);
 void perf_test_destroy_vm(struct kvm_vm *vm);
-void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes);
+void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus,
+                          uint64_t vcpu_memory_bytes,
+                          bool partition_vcpu_memory_access);
 
 #endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */
index ffffa56..b7f4139 100644 (file)
@@ -64,7 +64,21 @@ int64_t timespec_to_ns(struct timespec ts);
 struct timespec timespec_add_ns(struct timespec ts, int64_t ns);
 struct timespec timespec_add(struct timespec ts1, struct timespec ts2);
 struct timespec timespec_sub(struct timespec ts1, struct timespec ts2);
-struct timespec timespec_diff_now(struct timespec start);
+struct timespec timespec_elapsed(struct timespec start);
 struct timespec timespec_div(struct timespec ts, int divisor);
 
+enum vm_mem_backing_src_type {
+       VM_MEM_SRC_ANONYMOUS,
+       VM_MEM_SRC_ANONYMOUS_THP,
+       VM_MEM_SRC_ANONYMOUS_HUGETLB,
+};
+
+struct vm_mem_backing_src_alias {
+       const char *name;
+       enum vm_mem_backing_src_type type;
+};
+
+void backing_src_help(void);
+enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
+
 #endif /* SELFTEST_KVM_TEST_UTIL_H */
index 90cd598..0b30b4e 100644 (file)
@@ -263,6 +263,19 @@ static inline void outl(uint16_t port, uint32_t value)
        __asm__ __volatile__("outl %%eax, %%dx" : : "d"(port), "a"(value));
 }
 
+static inline void cpuid(uint32_t *eax, uint32_t *ebx,
+                        uint32_t *ecx, uint32_t *edx)
+{
+       /* ecx is often an input as well as an output. */
+       asm volatile("cpuid"
+           : "=a" (*eax),
+             "=b" (*ebx),
+             "=c" (*ecx),
+             "=d" (*edx)
+           : "0" (*eax), "2" (*ecx)
+           : "memory");
+}
+
 #define SET_XMM(__var, __xmm) \
        asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm)
 
@@ -338,8 +351,10 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid,
                     struct kvm_x86_state *state);
 
 struct kvm_msr_list *kvm_get_msr_index_list(void);
-
+uint64_t kvm_get_feature_msr(uint64_t msr_index);
 struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
+
+struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
 void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
                    struct kvm_cpuid2 *cpuid);
 
@@ -391,6 +406,10 @@ bool set_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *ent);
 uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
                       uint64_t a3);
 
+struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void);
+void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
+struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
+
 /*
  * Basic CPU control in CR0
  */
@@ -406,8 +425,27 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
 #define X86_CR0_CD          (1UL<<30) /* Cache Disable */
 #define X86_CR0_PG          (1UL<<31) /* Paging */
 
+#define APIC_DEFAULT_GPA               0xfee00000ULL
+
+/* APIC base address MSR and fields */
+#define MSR_IA32_APICBASE              0x0000001b
+#define MSR_IA32_APICBASE_BSP          (1<<8)
+#define MSR_IA32_APICBASE_EXTD         (1<<10)
+#define MSR_IA32_APICBASE_ENABLE       (1<<11)
+#define MSR_IA32_APICBASE_BASE         (0xfffff<<12)
+#define                GET_APIC_BASE(x)        (((x) >> 12) << 12)
+
 #define APIC_BASE_MSR  0x800
 #define X2APIC_ENABLE  (1UL << 10)
+#define        APIC_ID         0x20
+#define        APIC_LVR        0x30
+#define                GET_APIC_ID_FIELD(x)    (((x) >> 24) & 0xFF)
+#define        APIC_TASKPRI    0x80
+#define        APIC_PROCPRI    0xA0
+#define        APIC_EOI        0xB0
+#define        APIC_SPIV       0xF0
+#define                APIC_SPIV_FOCUS_DISABLED        (1 << 9)
+#define                APIC_SPIV_APIC_ENABLED          (1 << 8)
 #define        APIC_ICR        0x300
 #define                APIC_DEST_SELF          0x40000
 #define                APIC_DEST_ALLINC        0x80000
@@ -432,6 +470,7 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
 #define                APIC_DM_EXTINT          0x00700
 #define                APIC_VECTOR_MASK        0x000FF
 #define        APIC_ICR2       0x310
+#define                SET_APIC_DEST_FIELD(x)  ((x) << 24)
 
 /* VMX_EPT_VPID_CAP bits */
 #define VMX_EPT_VPID_CAP_AD_BITS       (1ULL << 21)
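
For illustration, the new cpuid() helper added above can be used from selftest guest code as sketched here; the helper name is hypothetical and only standard CPUID leaf 1 semantics are assumed:

/* Hypothetical guest-side helper: read the initial APIC ID via CPUID.01H. */
static inline uint32_t initial_apic_id(void)
{
	uint32_t eax = 1, ebx = 0, ecx = 0, edx = 0;

	cpuid(&eax, &ebx, &ecx, &edx);

	return (ebx >> 24) & 0xff;	/* CPUID.01H:EBX[31:24] */
}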
index fa5a90e..d787cb8 100644 (file)
@@ -1801,6 +1801,7 @@ static struct exit_reason {
        {KVM_EXIT_DIRTY_RING_FULL, "DIRTY_RING_FULL"},
        {KVM_EXIT_X86_RDMSR, "RDMSR"},
        {KVM_EXIT_X86_WRMSR, "WRMSR"},
+       {KVM_EXIT_XEN, "XEN"},
 #ifdef KVM_EXIT_MEMORY_NOT_PRESENT
        {KVM_EXIT_MEMORY_NOT_PRESENT, "MEMORY_NOT_PRESENT"},
 #endif
index 9be1944..81490b9 100644 (file)
@@ -49,7 +49,8 @@ static void guest_code(uint32_t vcpu_id)
 }
 
 struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
-                                  uint64_t vcpu_memory_bytes)
+                                  uint64_t vcpu_memory_bytes,
+                                  enum vm_mem_backing_src_type backing_src)
 {
        struct kvm_vm *vm;
        uint64_t guest_num_pages;
@@ -93,8 +94,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
        pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);
 
        /* Add an extra memory slot for testing */
-       vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
-                                   guest_test_phys_mem,
+       vm_userspace_mem_region_add(vm, backing_src, guest_test_phys_mem,
                                    PERF_TEST_MEM_SLOT_INDEX,
                                    guest_num_pages, 0);
 
@@ -112,7 +112,9 @@ void perf_test_destroy_vm(struct kvm_vm *vm)
        kvm_vm_free(vm);
 }
 
-void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes)
+void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus,
+                          uint64_t vcpu_memory_bytes,
+                          bool partition_vcpu_memory_access)
 {
        vm_paddr_t vcpu_gpa;
        struct perf_test_vcpu_args *vcpu_args;
@@ -122,13 +124,22 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_by
                vcpu_args = &perf_test_args.vcpu_args[vcpu_id];
 
                vcpu_args->vcpu_id = vcpu_id;
-               vcpu_args->gva = guest_test_virt_mem +
-                                (vcpu_id * vcpu_memory_bytes);
-               vcpu_args->pages = vcpu_memory_bytes /
-                                  perf_test_args.guest_page_size;
+               if (partition_vcpu_memory_access) {
+                       vcpu_args->gva = guest_test_virt_mem +
+                                        (vcpu_id * vcpu_memory_bytes);
+                       vcpu_args->pages = vcpu_memory_bytes /
+                                          perf_test_args.guest_page_size;
+                       vcpu_gpa = guest_test_phys_mem +
+                                  (vcpu_id * vcpu_memory_bytes);
+               } else {
+                       vcpu_args->gva = guest_test_virt_mem;
+                       vcpu_args->pages = (vcpus * vcpu_memory_bytes) /
+                                          perf_test_args.guest_page_size;
+                       vcpu_gpa = guest_test_phys_mem;
+               }
 
-               vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes);
                pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n",
-                        vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes);
+                        vcpu_id, vcpu_gpa, vcpu_gpa +
+                        (vcpu_args->pages * perf_test_args.guest_page_size));
        }
 }
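perf_test_create_vm() and perf_test_setup_vcpus() now let callers pick both the memory backing and whether each vCPU touches a private slice or the whole region. A hypothetical caller; the mode, sizes and vCPU count are made-up values, not taken from this patch:

	/* 4 vCPUs, 1 GiB per vCPU, THP-backed, all vCPUs sharing the region. */
	struct kvm_vm *vm;

	vm = perf_test_create_vm(VM_MODE_DEFAULT, 4, 1ul << 30,
				 VM_MEM_SRC_ANONYMOUS_THP);
	perf_test_setup_vcpus(vm, 4, 1ul << 30,
			      false /* partition_vcpu_memory_access */);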
index 8e04c0b..906c955 100644 (file)
@@ -10,6 +10,7 @@
 #include <limits.h>
 #include <stdlib.h>
 #include <time.h>
+#include "linux/kernel.h"
 
 #include "test_util.h"
 
@@ -84,7 +85,7 @@ struct timespec timespec_sub(struct timespec ts1, struct timespec ts2)
        return timespec_add_ns((struct timespec){0}, ns1 - ns2);
 }
 
-struct timespec timespec_diff_now(struct timespec start)
+struct timespec timespec_elapsed(struct timespec start)
 {
        struct timespec end;
 
@@ -109,3 +110,31 @@ void print_skip(const char *fmt, ...)
        va_end(ap);
        puts(", skipping test");
 }
+
+const struct vm_mem_backing_src_alias backing_src_aliases[] = {
+       {"anonymous", VM_MEM_SRC_ANONYMOUS,},
+       {"anonymous_thp", VM_MEM_SRC_ANONYMOUS_THP,},
+       {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
+};
+
+void backing_src_help(void)
+{
+       int i;
+
+       printf("Available backing src types:\n");
+       for (i = 0; i < ARRAY_SIZE(backing_src_aliases); i++)
+               printf("\t%s\n", backing_src_aliases[i].name);
+}
+
+enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(backing_src_aliases); i++)
+               if (!strcmp(type_name, backing_src_aliases[i].name))
+                       return backing_src_aliases[i].type;
+
+       backing_src_help();
+       TEST_FAIL("Unknown backing src type: %s", type_name);
+       return -1;
+}
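parse_backing_src_type() and backing_src_help() are meant to back a command line switch in the tests that gain a backing_src parameter. A hypothetical option-parsing fragment; the '-s' letter and the surrounding code are assumptions for illustration:

	enum vm_mem_backing_src_type src = VM_MEM_SRC_ANONYMOUS;
	int opt;

	while ((opt = getopt(argc, argv, "s:h")) != -1) {
		switch (opt) {
		case 's':
			/* Prints the alias list and fails on an unknown name. */
			src = parse_backing_src_type(optarg);
			break;
		case 'h':
		default:
			backing_src_help();
			exit(0);
		}
	}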
index 95e1a75..de0c761 100644 (file)
@@ -669,6 +669,82 @@ struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
        return cpuid;
 }
 
+/*
+ * KVM Get Feature MSR
+ *
+ * Input Args:
+ *   msr_index - Index of the feature MSR
+ *
+ * Output Args: None
+ *
+ * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced.
+ *
+ * Get the value of a KVM feature MSR via the system-scoped KVM_GET_MSRS
+ * ioctl on /dev/kvm; no vCPU is involved.
+ */
+uint64_t kvm_get_feature_msr(uint64_t msr_index)
+{
+       struct {
+               struct kvm_msrs header;
+               struct kvm_msr_entry entry;
+       } buffer = {};
+       int r, kvm_fd;
+
+       buffer.header.nmsrs = 1;
+       buffer.entry.index = msr_index;
+       kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
+       if (kvm_fd < 0)
+               exit(KSFT_SKIP);
+
+       r = ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header);
+       TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n"
+               "  rc: %i errno: %i", r, errno);
+
+       close(kvm_fd);
+       return buffer.entry.data;
+}
+
+/*
+ * VM VCPU CPUID Get
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU id
+ *
+ * Output Args: None
+ *
+ * Return: KVM CPUID (KVM_GET_CPUID2)
+ *
+ * Get the VCPU's CPUID. The number of entries is not known up front, so
+ * KVM_GET_CPUID2 is retried with a growing nent until it stops failing
+ * with E2BIG.
+ */
+struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid)
+{
+       struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+       struct kvm_cpuid2 *cpuid;
+       int rc, max_ent;
+
+       TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+       cpuid = allocate_kvm_cpuid2();
+       max_ent = cpuid->nent;
+
+       for (cpuid->nent = 1; cpuid->nent <= max_ent; cpuid->nent++) {
+               rc = ioctl(vcpu->fd, KVM_GET_CPUID2, cpuid);
+               if (!rc)
+                       break;
+
+               TEST_ASSERT(rc == -1 && errno == E2BIG,
+                           "KVM_GET_CPUID2 should either succeed or give E2BIG: %d %d",
+                           rc, errno);
+       }
+
+       TEST_ASSERT(rc == 0, "KVM_GET_CPUID2 failed, rc: %i errno: %i",
+                   rc, errno);
+
+       return cpuid;
+}
+
 /*
  * Locate a cpuid entry.
  *
@@ -1224,3 +1300,71 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
                     : "b"(a0), "c"(a1), "d"(a2), "S"(a3));
        return r;
 }
+
+struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void)
+{
+       static struct kvm_cpuid2 *cpuid;
+       int ret;
+       int kvm_fd;
+
+       if (cpuid)
+               return cpuid;
+
+       cpuid = allocate_kvm_cpuid2();
+       kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
+       if (kvm_fd < 0)
+               exit(KSFT_SKIP);
+
+       ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
+       TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_HV_CPUID failed %d %d\n",
+                   ret, errno);
+
+       close(kvm_fd);
+       return cpuid;
+}
+
+void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid)
+{
+       static struct kvm_cpuid2 *cpuid_full;
+       struct kvm_cpuid2 *cpuid_sys, *cpuid_hv;
+       int i, nent = 0;
+
+       if (!cpuid_full) {
+               cpuid_sys = kvm_get_supported_cpuid();
+               cpuid_hv = kvm_get_supported_hv_cpuid();
+
+               cpuid_full = malloc(sizeof(*cpuid_full) +
+                                   (cpuid_sys->nent + cpuid_hv->nent) *
+                                   sizeof(struct kvm_cpuid_entry2));
+               if (!cpuid_full) {
+                       perror("malloc");
+                       abort();
+               }
+
+               /* Need to skip KVM CPUID leaves 0x400000xx */
+               for (i = 0; i < cpuid_sys->nent; i++) {
+                       if (cpuid_sys->entries[i].function >= 0x40000000 &&
+                           cpuid_sys->entries[i].function < 0x40000100)
+                               continue;
+                       cpuid_full->entries[nent] = cpuid_sys->entries[i];
+                       nent++;
+               }
+
+               memcpy(&cpuid_full->entries[nent], cpuid_hv->entries,
+                      cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2));
+               cpuid_full->nent = nent + cpuid_hv->nent;
+       }
+
+       vcpu_set_cpuid(vm, vcpuid, cpuid_full);
+}
+
+struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid)
+{
+       struct kvm_cpuid2 *cpuid;
+
+       cpuid = allocate_kvm_cpuid2();
+
+       vcpu_ioctl(vm, vcpuid, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
+
+       return cpuid;
+}
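vcpu_set_hv_cpuid() builds the combined CPUID once (supported leaves minus KVM's own 0x400000xx range, plus the Hyper-V leaves) and caches it, so a test needs a single call before enabling Hyper-V features. A sketch of the call sequence, mirroring how the updated evmcs_test and the new xen_vmcall_test below use it (VCPU_ID and guest_code are the caller's own definitions):

	vm = vm_create_default(VCPU_ID, 0, guest_code);

	/* Supported CPUID with the Hyper-V 0x40000000 leaves spliced in. */
	vcpu_set_hv_cpuid(vm, VCPU_ID);
	vcpu_enable_evmcs(vm, VCPU_ID);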
index 3a5c72e..827fe60 100644 (file)
@@ -74,7 +74,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r
        wrmsr(MSR_VM_HSAVE_PA, svm->save_area_gpa);
 
        memset(vmcb, 0, sizeof(*vmcb));
-       asm volatile ("vmsave\n\t" : : "a" (vmcb_gpa) : "memory");
+       asm volatile ("vmsave %0\n\t" : : "a" (vmcb_gpa) : "memory");
        vmcb_set_seg(&save->es, get_es(), 0, -1U, data_seg_attr);
        vmcb_set_seg(&save->cs, get_cs(), 0, -1U, code_seg_attr);
        vmcb_set_seg(&save->ss, get_ss(), 0, -1U, data_seg_attr);
@@ -131,19 +131,19 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r
 void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa)
 {
        asm volatile (
-               "vmload\n\t"
+               "vmload %[vmcb_gpa]\n\t"
                "mov rflags, %%r15\n\t" // rflags
                "mov %%r15, 0x170(%[vmcb])\n\t"
                "mov guest_regs, %%r15\n\t"     // rax
                "mov %%r15, 0x1f8(%[vmcb])\n\t"
                LOAD_GPR_C
-               "vmrun\n\t"
+               "vmrun %[vmcb_gpa]\n\t"
                SAVE_GPR_C
                "mov 0x170(%[vmcb]), %%r15\n\t" // rflags
                "mov %%r15, rflags\n\t"
                "mov 0x1f8(%[vmcb]), %%r15\n\t" // rax
                "mov %%r15, guest_regs\n\t"
-               "vmsave\n\t"
+               "vmsave %[vmcb_gpa]\n\t"
                : : [vmcb] "r" (vmcb), [vmcb_gpa] "a" (vmcb_gpa)
                : "r15", "memory");
 }
diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
new file mode 100644 (file)
index 0000000..6096bf0
--- /dev/null
@@ -0,0 +1,212 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM memslot modification stress test
+ * Adapted from demand_paging_test.c
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ * Copyright (C) 2020, Google, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <asm/unistd.h>
+#include <time.h>
+#include <poll.h>
+#include <pthread.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/userfaultfd.h>
+
+#include "perf_test_util.h"
+#include "processor.h"
+#include "test_util.h"
+#include "guest_modes.h"
+
+#define DUMMY_MEMSLOT_INDEX 7
+
+#define DEFAULT_MEMSLOT_MODIFICATION_ITERATIONS 10
+
+
+static int nr_vcpus = 1;
+static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
+
+static bool run_vcpus = true;
+
+static void *vcpu_worker(void *data)
+{
+       int ret;
+       struct perf_test_vcpu_args *vcpu_args =
+               (struct perf_test_vcpu_args *)data;
+       int vcpu_id = vcpu_args->vcpu_id;
+       struct kvm_vm *vm = perf_test_args.vm;
+       struct kvm_run *run;
+
+       vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
+       run = vcpu_state(vm, vcpu_id);
+
+       /* Let the guest access its memory until a stop signal is received */
+       while (READ_ONCE(run_vcpus)) {
+               ret = _vcpu_run(vm, vcpu_id);
+               TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
+
+               if (get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC)
+                       continue;
+
+               TEST_ASSERT(false,
+                           "Invalid guest sync status: exit_reason=%s\n",
+                           exit_reason_str(run->exit_reason));
+       }
+
+       return NULL;
+}
+
+struct memslot_antagonist_args {
+       struct kvm_vm *vm;
+       useconds_t delay;
+       uint64_t nr_modifications;
+};
+
+static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay,
+                             uint64_t nr_modifications, uint64_t gpa)
+{
+       int i;
+
+       for (i = 0; i < nr_modifications; i++) {
+               usleep(delay);
+               vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa,
+                                           DUMMY_MEMSLOT_INDEX, 1, 0);
+
+               vm_mem_region_delete(vm, DUMMY_MEMSLOT_INDEX);
+       }
+}
+
+struct test_params {
+       useconds_t memslot_modification_delay;
+       uint64_t nr_memslot_modifications;
+       bool partition_vcpu_memory_access;
+};
+
+static void run_test(enum vm_guest_mode mode, void *arg)
+{
+       struct test_params *p = arg;
+       pthread_t *vcpu_threads;
+       struct kvm_vm *vm;
+       int vcpu_id;
+
+       vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size,
+                                VM_MEM_SRC_ANONYMOUS);
+
+       perf_test_args.wr_fract = 1;
+
+       vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
+       TEST_ASSERT(vcpu_threads, "Memory allocation failed");
+
+       perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size,
+                             p->partition_vcpu_memory_access);
+
+       /* Export the shared variables to the guest */
+       sync_global_to_guest(vm, perf_test_args);
+
+       pr_info("Finished creating vCPUs\n");
+
+       for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++)
+               pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
+                              &perf_test_args.vcpu_args[vcpu_id]);
+
+       pr_info("Started all vCPUs\n");
+
+       add_remove_memslot(vm, p->memslot_modification_delay,
+                          p->nr_memslot_modifications,
+                          guest_test_phys_mem +
+                          (guest_percpu_mem_size * nr_vcpus) +
+                          perf_test_args.host_page_size +
+                          perf_test_args.guest_page_size);
+
+       run_vcpus = false;
+
+       /* Wait for the vcpu threads to quit */
+       for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++)
+               pthread_join(vcpu_threads[vcpu_id], NULL);
+
+       pr_info("All vCPU threads joined\n");
+
+       ucall_uninit(vm);
+       kvm_vm_free(vm);
+
+       free(vcpu_threads);
+}
+
+static void help(char *name)
+{
+       puts("");
+       printf("usage: %s [-h] [-m mode] [-d delay_usec]\n"
+              "          [-b memory] [-v vcpus] [-o] [-i iterations]\n", name);
+       guest_modes_help();
+       printf(" -d: add a delay between each iteration of adding and\n"
+              "     deleting a memslot in usec.\n");
+       printf(" -b: specify the size of the memory region which should be\n"
+              "     accessed by each vCPU. e.g. 10M or 3G.\n"
+              "     Default: 1G\n");
+       printf(" -v: specify the number of vCPUs to run.\n");
+       printf(" -o: Overlap guest memory accesses instead of partitioning\n"
+              "     them into a separate region of memory for each vCPU.\n");
+       printf(" -i: specify the number of iterations of adding and removing\n"
+              "     a memslot.\n"
+              "     Default: %d\n", DEFAULT_MEMSLOT_MODIFICATION_ITERATIONS);
+       puts("");
+       exit(0);
+}
+
+int main(int argc, char *argv[])
+{
+       int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+       int opt;
+       struct test_params p = {
+               .memslot_modification_delay = 0,
+               .nr_memslot_modifications =
+                       DEFAULT_MEMSLOT_MODIFICATION_ITERATIONS,
+               .partition_vcpu_memory_access = true
+       };
+
+       guest_modes_append_default();
+
+       while ((opt = getopt(argc, argv, "hm:d:b:v:oi:")) != -1) {
+               switch (opt) {
+               case 'm':
+                       guest_modes_cmdline(optarg);
+                       break;
+               case 'd':
+                       p.memslot_modification_delay = strtoul(optarg, NULL, 0);
+                       TEST_ASSERT(p.memslot_modification_delay >= 0,
+                                   "A negative delay is not supported.");
+                       break;
+               case 'b':
+                       guest_percpu_mem_size = parse_size(optarg);
+                       break;
+               case 'v':
+                       nr_vcpus = atoi(optarg);
+                       TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
+                                   "Invalid number of vcpus, must be between 1 and %d",
+                                   max_vcpus);
+                       break;
+               case 'o':
+                       p.partition_vcpu_memory_access = false;
+                       break;
+               case 'i':
+                       p.nr_memslot_modifications = atoi(optarg);
+                       break;
+               case 'h':
+               default:
+                       help(argv[0]);
+                       break;
+               }
+       }
+
+       for_each_guest_mode(run_test, &p);
+
+       return 0;
+}
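A short note on what the new test exercises, assuming standard KVM memslot semantics rather than anything specific to this patch:

/*
 * Every vm_userspace_mem_region_add()/vm_mem_region_delete() pair in
 * add_remove_memslot() changes the memslot layout, which bumps the memslots
 * generation and forces KVM to drop cached translations, while the vCPU
 * worker threads keep faulting their test memory in parallel.
 */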
diff --git a/tools/testing/selftests/kvm/settings b/tools/testing/selftests/kvm/settings
new file mode 100644 (file)
index 0000000..6091b45
--- /dev/null
@@ -0,0 +1 @@
+timeout=120
index 37b8a78..ca22ee6 100644 (file)
@@ -99,6 +99,7 @@ int main(int argc, char *argv[])
                exit(KSFT_SKIP);
        }
 
+       vcpu_set_hv_cpuid(vm, VCPU_ID);
        vcpu_enable_evmcs(vm, VCPU_ID);
 
        run = vcpu_state(vm, VCPU_ID);
@@ -142,7 +143,7 @@ int main(int argc, char *argv[])
                /* Restore state in a new VM.  */
                kvm_vm_restart(vm, O_RDWR);
                vm_vcpu_add(vm, VCPU_ID);
-               vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+               vcpu_set_hv_cpuid(vm, VCPU_ID);
                vcpu_enable_evmcs(vm, VCPU_ID);
                vcpu_load_state(vm, VCPU_ID, state);
                run = vcpu_state(vm, VCPU_ID);
diff --git a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c
new file mode 100644 (file)
index 0000000..9b78e88
--- /dev/null
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021, Red Hat Inc.
+ *
+ * Generic tests for KVM CPUID set/get ioctls
+ */
+#include <asm/kvm_para.h>
+#include <linux/kvm_para.h>
+#include <stdint.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+/*
+ * CPUID leaves known to differ between KVM_GET_SUPPORTED_CPUID and the
+ * vCPU's view (e.g. leaf 0xd, whose save-area sizes depend on the
+ * currently enabled XCR0 features).
+ */
+struct {
+       u32 function;
+       u32 index;
+} mangled_cpuids[] = {
+       {.function = 0xd, .index = 0},
+};
+
+static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid)
+{
+       int i;
+       u32 eax, ebx, ecx, edx;
+
+       for (i = 0; i < guest_cpuid->nent; i++) {
+               eax = guest_cpuid->entries[i].function;
+               ecx = guest_cpuid->entries[i].index;
+
+               cpuid(&eax, &ebx, &ecx, &edx);
+
+               GUEST_ASSERT(eax == guest_cpuid->entries[i].eax &&
+                            ebx == guest_cpuid->entries[i].ebx &&
+                            ecx == guest_cpuid->entries[i].ecx &&
+                            edx == guest_cpuid->entries[i].edx);
+       }
+
+}
+
+static void test_cpuid_40000000(struct kvm_cpuid2 *guest_cpuid)
+{
+       u32 eax = 0x40000000, ebx, ecx = 0, edx;
+
+       cpuid(&eax, &ebx, &ecx, &edx);
+
+       GUEST_ASSERT(eax == 0x40000001);
+}
+
+static void guest_main(struct kvm_cpuid2 *guest_cpuid)
+{
+       GUEST_SYNC(1);
+
+       test_guest_cpuids(guest_cpuid);
+
+       GUEST_SYNC(2);
+
+       test_cpuid_40000000(guest_cpuid);
+
+       GUEST_DONE();
+}
+
+static bool is_cpuid_mangled(struct kvm_cpuid_entry2 *entrie)
+{
+       int i;
+
+       for (i = 0; i < sizeof(mangled_cpuids) / sizeof(mangled_cpuids[0]); i++) {
+               if (mangled_cpuids[i].function == entrie->function &&
+                   mangled_cpuids[i].index == entrie->index)
+                       return true;
+       }
+
+       return false;
+}
+
+static void check_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *entrie)
+{
+       int i;
+
+       for (i = 0; i < cpuid->nent; i++) {
+               if (cpuid->entries[i].function == entrie->function &&
+                   cpuid->entries[i].index == entrie->index) {
+                       if (is_cpuid_mangled(entrie))
+                               return;
+
+                       TEST_ASSERT(cpuid->entries[i].eax == entrie->eax &&
+                                   cpuid->entries[i].ebx == entrie->ebx &&
+                                   cpuid->entries[i].ecx == entrie->ecx &&
+                                   cpuid->entries[i].edx == entrie->edx,
+                                   "CPUID 0x%x.%x differ: 0x%x:0x%x:0x%x:0x%x vs 0x%x:0x%x:0x%x:0x%x",
+                                   entrie->function, entrie->index,
+                                   cpuid->entries[i].eax, cpuid->entries[i].ebx,
+                                   cpuid->entries[i].ecx, cpuid->entries[i].edx,
+                                   entrie->eax, entrie->ebx, entrie->ecx, entrie->edx);
+                       return;
+               }
+       }
+
+       TEST_ASSERT(false, "CPUID 0x%x.%x not found", entrie->function, entrie->index);
+}
+
+static void compare_cpuids(struct kvm_cpuid2 *cpuid1, struct kvm_cpuid2 *cpuid2)
+{
+       int i;
+
+       for (i = 0; i < cpuid1->nent; i++)
+               check_cpuid(cpuid2, &cpuid1->entries[i]);
+
+       for (i = 0; i < cpuid2->nent; i++)
+               check_cpuid(cpuid1, &cpuid2->entries[i]);
+}
+
+static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage)
+{
+       struct ucall uc;
+
+       _vcpu_run(vm, vcpuid);
+
+       switch (get_ucall(vm, vcpuid, &uc)) {
+       case UCALL_SYNC:
+               TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+                           uc.args[1] == stage + 1,
+                           "Stage %d: Unexpected register values vmexit, got %lx",
+                           stage + 1, (ulong)uc.args[1]);
+               return;
+       case UCALL_DONE:
+               return;
+       case UCALL_ABORT:
+               TEST_ASSERT(false, "%s at %s:%ld\n\tvalues: %#lx, %#lx", (const char *)uc.args[0],
+                           __FILE__, uc.args[1], uc.args[2], uc.args[3]);
+       default:
+               TEST_ASSERT(false, "Unexpected exit: %s",
+                           exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason));
+       }
+}
+
+struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct kvm_cpuid2 *cpuid)
+{
+       int size = sizeof(*cpuid) + cpuid->nent * sizeof(cpuid->entries[0]);
+       vm_vaddr_t gva = vm_vaddr_alloc(vm, size,
+                                       getpagesize(), 0, 0);
+       struct kvm_cpuid2 *guest_cpuids = addr_gva2hva(vm, gva);
+
+       memcpy(guest_cpuids, cpuid, size);
+
+       *p_gva = gva;
+       return guest_cpuids;
+}
+
+int main(void)
+{
+       struct kvm_cpuid2 *supp_cpuid, *cpuid2;
+       vm_vaddr_t cpuid_gva;
+       struct kvm_vm *vm;
+       int stage;
+
+       vm = vm_create_default(VCPU_ID, 0, guest_main);
+
+       supp_cpuid = kvm_get_supported_cpuid();
+       cpuid2 = vcpu_get_cpuid(vm, VCPU_ID);
+
+       compare_cpuids(supp_cpuid, cpuid2);
+
+       vcpu_alloc_cpuid(vm, &cpuid_gva, cpuid2);
+
+       vcpu_args_set(vm, VCPU_ID, 1, cpuid_gva);
+
+       for (stage = 0; stage < 3; stage++)
+               run_vcpu(vm, VCPU_ID, stage);
+
+       kvm_vm_free(vm);
+}
index 88a595b..7e2d2d1 100644 (file)
@@ -125,30 +125,6 @@ void test_hv_cpuid_e2big(struct kvm_vm *vm, bool system)
                    " it should have: %d %d", system ? "KVM" : "vCPU", ret, errno);
 }
 
-
-struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(struct kvm_vm *vm, bool system)
-{
-       int nent = 20; /* should be enough */
-       static struct kvm_cpuid2 *cpuid;
-
-       cpuid = malloc(sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2));
-
-       if (!cpuid) {
-               perror("malloc");
-               abort();
-       }
-
-       cpuid->nent = nent;
-
-       if (!system)
-               vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
-       else
-               kvm_ioctl(vm, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
-
-       return cpuid;
-}
-
-
 int main(int argc, char *argv[])
 {
        struct kvm_vm *vm;
@@ -167,7 +143,7 @@ int main(int argc, char *argv[])
        /* Test vCPU ioctl version */
        test_hv_cpuid_e2big(vm, false);
 
-       hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm, false);
+       hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vm, VCPU_ID);
        test_hv_cpuid(hv_cpuid_entries, false);
        free(hv_cpuid_entries);
 
@@ -177,7 +153,7 @@ int main(int argc, char *argv[])
                goto do_sys;
        }
        vcpu_enable_evmcs(vm, VCPU_ID);
-       hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm, false);
+       hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vm, VCPU_ID);
        test_hv_cpuid(hv_cpuid_entries, true);
        free(hv_cpuid_entries);
 
@@ -190,9 +166,8 @@ do_sys:
 
        test_hv_cpuid_e2big(vm, true);
 
-       hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm, true);
+       hv_cpuid_entries = kvm_get_supported_hv_cpuid();
        test_hv_cpuid(hv_cpuid_entries, nested_vmx_supported());
-       free(hv_cpuid_entries);
 
 out:
        kvm_vm_free(vm);
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_msrs_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_msrs_test.c
new file mode 100644 (file)
index 0000000..23051d8
--- /dev/null
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * VMX-pmu related msrs test
+ *
+ * Copyright (C) 2021 Intel Corporation
+ *
+ * Test the effect of various CPUID settings on the
+ * MSR_IA32_PERF_CAPABILITIES MSR: whatever is written with KVM_SET_MSR
+ * must not be modified from the guest, and must be readable back with
+ * KVM_GET_MSR.
+ *
+ * Also test that invalid LBR formats are rejected.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <sys/ioctl.h>
+
+#include "kvm_util.h"
+#include "vmx.h"
+
+#define VCPU_ID              0
+
+#define X86_FEATURE_PDCM       (1<<15)
+#define PMU_CAP_FW_WRITES      (1ULL << 13)
+#define PMU_CAP_LBR_FMT                0x3f
+
+union cpuid10_eax {
+       struct {
+               unsigned int version_id:8;
+               unsigned int num_counters:8;
+               unsigned int bit_width:8;
+               unsigned int mask_length:8;
+       } split;
+       unsigned int full;
+};
+
+union perf_capabilities {
+       struct {
+               u64     lbr_format:6;
+               u64     pebs_trap:1;
+               u64     pebs_arch_reg:1;
+               u64     pebs_format:4;
+               u64     smm_freeze:1;
+               u64     full_width_write:1;
+               u64     pebs_baseline:1;
+               u64     perf_metrics:1;
+               u64     pebs_output_pt_available:1;
+               u64     anythread_deprecated:1;
+       };
+       u64     capabilities;
+};
+
+static void guest_code(void)
+{
+       wrmsr(MSR_IA32_PERF_CAPABILITIES, PMU_CAP_LBR_FMT);
+}
+
+int main(int argc, char *argv[])
+{
+       struct kvm_cpuid2 *cpuid;
+       struct kvm_cpuid_entry2 *entry_1_0;
+       struct kvm_cpuid_entry2 *entry_a_0;
+       bool pdcm_supported = false;
+       struct kvm_vm *vm;
+       int ret;
+       union cpuid10_eax eax;
+       union perf_capabilities host_cap;
+
+       host_cap.capabilities = kvm_get_feature_msr(MSR_IA32_PERF_CAPABILITIES);
+       host_cap.capabilities &= (PMU_CAP_FW_WRITES | PMU_CAP_LBR_FMT);
+
+       /* Create VM */
+       vm = vm_create_default(VCPU_ID, 0, guest_code);
+       cpuid = kvm_get_supported_cpuid();
+
+       if (kvm_get_cpuid_max_basic() >= 0xa) {
+               entry_1_0 = kvm_get_supported_cpuid_index(1, 0);
+               entry_a_0 = kvm_get_supported_cpuid_index(0xa, 0);
+               pdcm_supported = entry_1_0 && !!(entry_1_0->ecx & X86_FEATURE_PDCM);
+               eax.full = entry_a_0->eax;
+       }
+       if (!pdcm_supported) {
+               print_skip("MSR_IA32_PERF_CAPABILITIES is not supported by the vCPU");
+               exit(KSFT_SKIP);
+       }
+       if (!eax.split.version_id) {
+               print_skip("PMU is not supported by the vCPU");
+               exit(KSFT_SKIP);
+       }
+
+       /* testcase 1, set capabilities when we have PDCM bit */
+       vcpu_set_cpuid(vm, VCPU_ID, cpuid);
+       vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_FW_WRITES);
+
+       /* check capabilities can be retrieved with KVM_GET_MSR */
+       ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), PMU_CAP_FW_WRITES);
+
+       /* check whatever we write with KVM_SET_MSR is _not_ modified */
+       vcpu_run(vm, VCPU_ID);
+       ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), PMU_CAP_FW_WRITES);
+
+       /* testcase 2, check valid LBR formats are accepted */
+       vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, 0);
+       ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), 0);
+
+       vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, host_cap.lbr_format);
+       ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), (u64)host_cap.lbr_format);
+
+       /* testcase 3, check invalid LBR format is rejected */
+       ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_LBR_FMT);
+       TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail.");
+
+       /* testcase 4, set capabilities when we don't have PDCM bit */
+       entry_1_0->ecx &= ~X86_FEATURE_PDCM;
+       vcpu_set_cpuid(vm, VCPU_ID, cpuid);
+       ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
+       TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail.");
+
+       /* testcase 5, set capabilities when we don't have PMU version bits */
+       entry_1_0->ecx |= X86_FEATURE_PDCM;
+       eax.split.version_id = 0;
+       entry_a_0->eax = eax.full;
+       vcpu_set_cpuid(vm, VCPU_ID, cpuid);
+       ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_FW_WRITES);
+       TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail.");
+
+       vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, 0);
+       ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), 0);
+
+       kvm_vm_free(vm);
+}
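A note on how the negative testcases above detect failure, based on how the selftest library helper (not shown in this diff) is written:

/*
 * _vcpu_set_msr() returns the number of MSRs the KVM_SET_MSRS ioctl
 * processed, so a return of 0 means KVM refused the write, which is the
 * expected outcome for the invalid configurations under test.
 */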
diff --git a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c
new file mode 100644 (file)
index 0000000..2f964cd
--- /dev/null
@@ -0,0 +1,544 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * xapic_ipi_test
+ *
+ * Copyright (C) 2020, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Test that when the APIC is in xAPIC mode, a vCPU can send an IPI to wake
+ * another vCPU that is halted when KVM's backing page for the APIC access
+ * address has been moved by mm.
+ *
+ * The test starts two vCPUs: one that sends IPIs and one that continually
+ * executes HLT. The sender checks that the halter has woken from the HLT and
+ * has reentered HLT before sending the next IPI. While the vCPUs are running,
+ * the host continually calls migrate_pages to move all of the process' pages
+ * amongst the available numa nodes on the machine.
+ *
+ * Migration is a command line option. When it is used on a non-NUMA machine
+ * the test exits with an error. The test is still useful on non-NUMA machines
+ * for exercising IPIs.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <getopt.h>
+#include <pthread.h>
+#include <inttypes.h>
+#include <string.h>
+#include <time.h>
+
+#include "kvm_util.h"
+#include "numaif.h"
+#include "processor.h"
+#include "test_util.h"
+#include "vmx.h"
+
+/* Default running time for the test */
+#define DEFAULT_RUN_SECS 3
+
+/* Default delay between migrate_pages calls (microseconds) */
+#define DEFAULT_DELAY_USECS 500000
+
+#define HALTER_VCPU_ID 0
+#define SENDER_VCPU_ID 1
+
+volatile uint32_t *apic_base = (volatile uint32_t *)APIC_DEFAULT_GPA;
+
+/*
+ * Vector for IPI from sender vCPU to halting vCPU.
+ * Value is arbitrary and was chosen for the alternating bit pattern. Any
+ * value should work.
+ */
+#define IPI_VECTOR      0xa5
+
+/*
+ * Incremented in the IPI handler. Provides evidence to the sender that the IPI
+ * arrived at the destination
+ */
+static volatile uint64_t ipis_rcvd;
+
+/* Data struct shared between host main thread and vCPUs */
+struct test_data_page {
+       uint32_t halter_apic_id;
+       volatile uint64_t hlt_count;
+       volatile uint64_t wake_count;
+       uint64_t ipis_sent;
+       uint64_t migrations_attempted;
+       uint64_t migrations_completed;
+       uint32_t icr;
+       uint32_t icr2;
+       uint32_t halter_tpr;
+       uint32_t halter_ppr;
+
+       /*
+        *  Record local version register as a cross-check that APIC access
+        *  worked. Value should match what KVM reports (APIC_VERSION in
+        *  arch/x86/kvm/lapic.c). If test is failing, check that values match
+        *  to determine whether APIC access exits are working.
+        */
+       uint32_t halter_lvr;
+};
+
+struct thread_params {
+       struct test_data_page *data;
+       struct kvm_vm *vm;
+       uint32_t vcpu_id;
+       uint64_t *pipis_rcvd; /* host address of ipis_rcvd global */
+};
+
+uint32_t read_apic_reg(uint reg)
+{
+       return apic_base[reg >> 2];
+}
+
+void write_apic_reg(uint reg, uint32_t val)
+{
+       apic_base[reg >> 2] = val;
+}
+
+void disable_apic(void)
+{
+       wrmsr(MSR_IA32_APICBASE,
+             rdmsr(MSR_IA32_APICBASE) &
+               ~(MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD));
+}
+
+void enable_xapic(void)
+{
+       uint64_t val = rdmsr(MSR_IA32_APICBASE);
+
+       /* Per SDM: to enable xAPIC when in x2APIC must first disable APIC */
+       if (val & MSR_IA32_APICBASE_EXTD) {
+               disable_apic();
+               wrmsr(MSR_IA32_APICBASE,
+                     rdmsr(MSR_IA32_APICBASE) | MSR_IA32_APICBASE_ENABLE);
+       } else if (!(val & MSR_IA32_APICBASE_ENABLE)) {
+               wrmsr(MSR_IA32_APICBASE, val | MSR_IA32_APICBASE_ENABLE);
+       }
+
+       /*
+        * Per SDM: reset value of spurious interrupt vector register has the
+        * APIC software enabled bit=0. It must be enabled in addition to the
+        * enable bit in the MSR.
+        */
+       val = read_apic_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED;
+       write_apic_reg(APIC_SPIV, val);
+}
+
+void verify_apic_base_addr(void)
+{
+       uint64_t msr = rdmsr(MSR_IA32_APICBASE);
+       uint64_t base = GET_APIC_BASE(msr);
+
+       GUEST_ASSERT(base == APIC_DEFAULT_GPA);
+}
+
+static void halter_guest_code(struct test_data_page *data)
+{
+       verify_apic_base_addr();
+       enable_xapic();
+
+       data->halter_apic_id = GET_APIC_ID_FIELD(read_apic_reg(APIC_ID));
+       data->halter_lvr = read_apic_reg(APIC_LVR);
+
+       /*
+        * Loop forever HLTing and recording halts & wakes. Disable interrupts
+        * each time around to minimize window between signaling the pending
+        * halt to the sender vCPU and executing the halt. No need to disable on
+        * first run as this vCPU executes first and the host waits for it to
+        * signal going into first halt before starting the sender vCPU. Record
+        * TPR and PPR for diagnostic purposes in case the test fails.
+        */
+       for (;;) {
+               data->halter_tpr = read_apic_reg(APIC_TASKPRI);
+               data->halter_ppr = read_apic_reg(APIC_PROCPRI);
+               data->hlt_count++;
+               asm volatile("sti; hlt; cli");
+               data->wake_count++;
+       }
+}
+
+/*
+ * Runs on halter vCPU when IPI arrives. Write an arbitrary non-zero value to
+ * enable diagnosing errant writes to the APIC access address backing page in
+ * case of test failure.
+ */
+static void guest_ipi_handler(struct ex_regs *regs)
+{
+       ipis_rcvd++;
+       write_apic_reg(APIC_EOI, 77);
+}
+
+static void sender_guest_code(struct test_data_page *data)
+{
+       uint64_t last_wake_count;
+       uint64_t last_hlt_count;
+       uint64_t last_ipis_rcvd_count;
+       uint32_t icr_val;
+       uint32_t icr2_val;
+       uint64_t tsc_start;
+
+       verify_apic_base_addr();
+       enable_xapic();
+
+       /*
+        * Init interrupt command register for sending IPIs
+        *
+        * Delivery mode=fixed, per SDM:
+        *   "Delivers the interrupt specified in the vector field to the target
+        *    processor."
+        *
+        * Destination mode=physical i.e. specify target by its local APIC
+        * ID. This vCPU assumes that the halter vCPU has already started and
+        * set data->halter_apic_id.
+        */
+       icr_val = (APIC_DEST_PHYSICAL | APIC_DM_FIXED | IPI_VECTOR);
+       icr2_val = SET_APIC_DEST_FIELD(data->halter_apic_id);
+       data->icr = icr_val;
+       data->icr2 = icr2_val;
+
+       last_wake_count = data->wake_count;
+       last_hlt_count = data->hlt_count;
+       last_ipis_rcvd_count = ipis_rcvd;
+       for (;;) {
+               /*
+                * Send IPI to halter vCPU.
+                * First IPI can be sent unconditionally because halter vCPU
+                * starts earlier.
+                */
+               write_apic_reg(APIC_ICR2, icr2_val);
+               write_apic_reg(APIC_ICR, icr_val);
+               data->ipis_sent++;
+
+               /*
+                * Wait up to ~1 sec for halter to indicate that it has:
+                * 1. Received the IPI
+                * 2. Woken up from the halt
+                * 3. Gone back into halt
+                * Current CPUs typically run at 2.x Ghz which is ~2
+                * billion ticks per second.
+                */
+               tsc_start = rdtsc();
+               while (rdtsc() - tsc_start < 2000000000) {
+                       if ((ipis_rcvd != last_ipis_rcvd_count) &&
+                           (data->wake_count != last_wake_count) &&
+                           (data->hlt_count != last_hlt_count))
+                               break;
+               }
+
+               GUEST_ASSERT((ipis_rcvd != last_ipis_rcvd_count) &&
+                            (data->wake_count != last_wake_count) &&
+                            (data->hlt_count != last_hlt_count));
+
+               last_wake_count = data->wake_count;
+               last_hlt_count = data->hlt_count;
+               last_ipis_rcvd_count = ipis_rcvd;
+       }
+}
+
+static void *vcpu_thread(void *arg)
+{
+       struct thread_params *params = (struct thread_params *)arg;
+       struct ucall uc;
+       int old;
+       int r;
+       unsigned int exit_reason;
+
+       r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old);
+       TEST_ASSERT(r == 0,
+                   "pthread_setcanceltype failed on vcpu_id=%u with errno=%d",
+                   params->vcpu_id, r);
+
+       fprintf(stderr, "vCPU thread running vCPU %u\n", params->vcpu_id);
+       vcpu_run(params->vm, params->vcpu_id);
+       exit_reason = vcpu_state(params->vm, params->vcpu_id)->exit_reason;
+
+       TEST_ASSERT(exit_reason == KVM_EXIT_IO,
+                   "vCPU %u exited with unexpected exit reason %u-%s, expected KVM_EXIT_IO",
+                   params->vcpu_id, exit_reason, exit_reason_str(exit_reason));
+
+       if (get_ucall(params->vm, params->vcpu_id, &uc) == UCALL_ABORT) {
+               TEST_ASSERT(false,
+                           "vCPU %u exited with error: %s.\n"
+                           "Sending vCPU sent %lu IPIs to halting vCPU\n"
+                           "Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n"
+                           "Halter TPR=%#x PPR=%#x LVR=%#x\n"
+                           "Migrations attempted: %lu\n"
+                           "Migrations completed: %lu\n",
+                           params->vcpu_id, (const char *)uc.args[0],
+                           params->data->ipis_sent, params->data->hlt_count,
+                           params->data->wake_count,
+                           *params->pipis_rcvd, params->data->halter_tpr,
+                           params->data->halter_ppr, params->data->halter_lvr,
+                           params->data->migrations_attempted,
+                           params->data->migrations_completed);
+       }
+
+       return NULL;
+}
+
+static void cancel_join_vcpu_thread(pthread_t thread, uint32_t vcpu_id)
+{
+       void *retval;
+       int r;
+
+       r = pthread_cancel(thread);
+       TEST_ASSERT(r == 0,
+                   "pthread_cancel on vcpu_id=%d failed with errno=%d",
+                   vcpu_id, r);
+
+       r = pthread_join(thread, &retval);
+       TEST_ASSERT(r == 0,
+                   "pthread_join on vcpu_id=%d failed with errno=%d",
+                   vcpu_id, r);
+       TEST_ASSERT(retval == PTHREAD_CANCELED,
+                   "expected retval=%p, got %p", PTHREAD_CANCELED,
+                   retval);
+}
+
+void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs,
+                  uint64_t *pipis_rcvd)
+{
+       long pages_not_moved;
+       unsigned long nodemask = 0;
+       unsigned long nodemasks[sizeof(nodemask) * 8];
+       int nodes = 0;
+       time_t start_time, last_update, now;
+       time_t interval_secs = 1;
+       int i, r;
+       int from, to;
+       unsigned long bit;
+       uint64_t hlt_count;
+       uint64_t wake_count;
+       uint64_t ipis_sent;
+
+       fprintf(stderr, "Calling migrate_pages every %d microseconds\n",
+               delay_usecs);
+
+       /* Get set of first 64 numa nodes available */
+       r = get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8,
+                         0, MPOL_F_MEMS_ALLOWED);
+       TEST_ASSERT(r == 0, "get_mempolicy failed errno=%d", errno);
+
+       fprintf(stderr, "Numa nodes found amongst first %lu possible nodes "
+               "(each 1-bit indicates node is present): %#lx\n",
+               sizeof(nodemask) * 8, nodemask);
+
+       /* Init array of masks containing a single-bit in each, one for each
+        * available node. migrate_pages called below requires specifying nodes
+        * as bit masks.
+        */
+       for (i = 0, bit = 1; i < sizeof(nodemask) * 8; i++, bit <<= 1) {
+               if (nodemask & bit) {
+                       nodemasks[nodes] = nodemask & bit;
+                       nodes++;
+               }
+       }
+
+       TEST_ASSERT(nodes > 1,
+                   "Did not find at least 2 numa nodes. Can't do migration\n");
+
+       fprintf(stderr, "Migrating amongst %d nodes found\n", nodes);
+
+       from = 0;
+       to = 1;
+       start_time = time(NULL);
+       last_update = start_time;
+
+       ipis_sent = data->ipis_sent;
+       hlt_count = data->hlt_count;
+       wake_count = data->wake_count;
+
+       while ((int)(time(NULL) - start_time) < run_secs) {
+               data->migrations_attempted++;
+
+               /*
+                * migrate_pages with PID=0 will migrate all pages of this
+                * process between the nodes specified as bitmasks. The page
+                * backing the APIC access address belongs to this process
+                * because it is allocated by KVM in the context of the
+                * KVM_CREATE_VCPU ioctl. If that assumption ever changes this
+                * test may break or give a false positive signal.
+                */
+               pages_not_moved = migrate_pages(0, sizeof(nodemasks[from]),
+                                               &nodemasks[from],
+                                               &nodemasks[to]);
+               if (pages_not_moved < 0)
+                       fprintf(stderr,
+                               "migrate_pages failed, errno=%d\n", errno);
+               else if (pages_not_moved > 0)
+                       fprintf(stderr,
+                               "migrate_pages could not move %ld pages\n",
+                               pages_not_moved);
+               else
+                       data->migrations_completed++;
+
+               from = to;
+               to++;
+               if (to == nodes)
+                       to = 0;
+
+               now = time(NULL);
+               if (((now - start_time) % interval_secs == 0) &&
+                   (now != last_update)) {
+                       last_update = now;
+                       fprintf(stderr,
+                               "%lu seconds: Migrations attempted=%lu completed=%lu, "
+                               "IPIs sent=%lu received=%lu, HLTs=%lu wakes=%lu\n",
+                               now - start_time, data->migrations_attempted,
+                               data->migrations_completed,
+                               data->ipis_sent, *pipis_rcvd,
+                               data->hlt_count, data->wake_count);
+
+                       TEST_ASSERT(ipis_sent != data->ipis_sent &&
+                                   hlt_count != data->hlt_count &&
+                                   wake_count != data->wake_count,
+                                   "IPI, HLT and wake count have not increased "
+                                   "in the last %lu seconds. "
+                                   "HLTer is likely hung.\n", interval_secs);
+
+                       ipis_sent = data->ipis_sent;
+                       hlt_count = data->hlt_count;
+                       wake_count = data->wake_count;
+               }
+               usleep(delay_usecs);
+       }
+}
+
+void get_cmdline_args(int argc, char *argv[], int *run_secs,
+                     bool *migrate, int *delay_usecs)
+{
+       for (;;) {
+               int opt = getopt(argc, argv, "s:d:m");
+
+               if (opt == -1)
+                       break;
+               switch (opt) {
+               case 's':
+                       *run_secs = parse_size(optarg);
+                       break;
+               case 'm':
+                       *migrate = true;
+                       break;
+               case 'd':
+                       *delay_usecs = parse_size(optarg);
+                       break;
+               default:
+                       TEST_ASSERT(false,
+                                   "Usage: -s <runtime seconds>. Default is %d seconds.\n"
+                                   "-m adds calls to migrate_pages while vCPUs are running."
+                                   " Default is no migrations.\n"
+                                   "-d <delay microseconds> - delay between migrate_pages() calls."
+                                   " Default is %d microseconds.\n",
+                                   DEFAULT_RUN_SECS, DEFAULT_DELAY_USECS);
+               }
+       }
+}
+
+int main(int argc, char *argv[])
+{
+       int r;
+       int wait_secs;
+       const int max_halter_wait = 10;
+       int run_secs = 0;
+       int delay_usecs = 0;
+       struct test_data_page *data;
+       vm_vaddr_t test_data_page_vaddr;
+       bool migrate = false;
+       pthread_t threads[2];
+       struct thread_params params[2];
+       struct kvm_vm *vm;
+       uint64_t *pipis_rcvd;
+
+       get_cmdline_args(argc, argv, &run_secs, &migrate, &delay_usecs);
+       if (run_secs <= 0)
+               run_secs = DEFAULT_RUN_SECS;
+       if (delay_usecs <= 0)
+               delay_usecs = DEFAULT_DELAY_USECS;
+
+       vm = vm_create_default(HALTER_VCPU_ID, 0, halter_guest_code);
+       params[0].vm = vm;
+       params[1].vm = vm;
+
+       vm_init_descriptor_tables(vm);
+       vcpu_init_descriptor_tables(vm, HALTER_VCPU_ID);
+       vm_handle_exception(vm, IPI_VECTOR, guest_ipi_handler);
+
+       virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA, 0);
+
+       vm_vcpu_add_default(vm, SENDER_VCPU_ID, sender_guest_code);
+
+       test_data_page_vaddr = vm_vaddr_alloc(vm, 0x1000, 0x1000, 0, 0);
+       data =
+          (struct test_data_page *)addr_gva2hva(vm, test_data_page_vaddr);
+       memset(data, 0, sizeof(*data));
+       params[0].data = data;
+       params[1].data = data;
+
+       vcpu_args_set(vm, HALTER_VCPU_ID, 1, test_data_page_vaddr);
+       vcpu_args_set(vm, SENDER_VCPU_ID, 1, test_data_page_vaddr);
+
+       pipis_rcvd = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ipis_rcvd);
+       params[0].pipis_rcvd = pipis_rcvd;
+       params[1].pipis_rcvd = pipis_rcvd;
+
+       /* Start halter vCPU thread and wait for it to execute first HLT. */
+       params[0].vcpu_id = HALTER_VCPU_ID;
+       r = pthread_create(&threads[0], NULL, vcpu_thread, &params[0]);
+       TEST_ASSERT(r == 0,
+                   "pthread_create halter failed errno=%d", errno);
+       fprintf(stderr, "Halter vCPU thread started\n");
+
+       wait_secs = 0;
+       while ((wait_secs < max_halter_wait) && !data->hlt_count) {
+               sleep(1);
+               wait_secs++;
+       }
+
+       TEST_ASSERT(data->hlt_count,
+                   "Halter vCPU did not execute first HLT within %d seconds",
+                   max_halter_wait);
+
+       fprintf(stderr,
+               "Halter vCPU thread reported its APIC ID: %u after %d seconds.\n",
+               data->halter_apic_id, wait_secs);
+
+       params[1].vcpu_id = SENDER_VCPU_ID;
+       r = pthread_create(&threads[1], NULL, vcpu_thread, &params[1]);
+       TEST_ASSERT(r == 0, "pthread_create sender failed errno=%d", errno);
+
+       fprintf(stderr,
+               "IPI sender vCPU thread started. Letting vCPUs run for %d seconds.\n",
+               run_secs);
+
+       if (!migrate)
+               sleep(run_secs);
+       else
+               do_migrations(data, run_secs, delay_usecs, pipis_rcvd);
+
+       /*
+        * Cancel threads and wait for them to stop.
+        */
+       cancel_join_vcpu_thread(threads[0], HALTER_VCPU_ID);
+       cancel_join_vcpu_thread(threads[1], SENDER_VCPU_ID);
+
+       fprintf(stderr,
+               "Test successful after running for %d seconds.\n"
+               "Sending vCPU sent %lu IPIs to halting vCPU\n"
+               "Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n"
+               "Halter APIC ID=%#x\n"
+               "Sender ICR value=%#x ICR2 value=%#x\n"
+               "Halter TPR=%#x PPR=%#x LVR=%#x\n"
+               "Migrations attempted: %lu\n"
+               "Migrations completed: %lu\n",
+               run_secs, data->ipis_sent,
+               data->hlt_count, data->wake_count, *pipis_rcvd,
+               data->halter_apic_id,
+               data->icr, data->icr2,
+               data->halter_tpr, data->halter_ppr, data->halter_lvr,
+               data->migrations_attempted, data->migrations_completed);
+
+       kvm_vm_free(vm);
+
+       return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
new file mode 100644 (file)
index 0000000..9246ea3
--- /dev/null
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * xen_shinfo_test
+ *
+ * Copyright © 2021 Amazon.com, Inc. or its affiliates.
+ *
+ * Xen shared_info / pvclock testing
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#include <stdint.h>
+#include <time.h>
+
+#define VCPU_ID                5
+
+#define SHINFO_REGION_GPA      0xc0000000ULL
+#define SHINFO_REGION_SLOT     10
+#define PAGE_SIZE              4096
+
+#define PVTIME_ADDR    (SHINFO_REGION_GPA + PAGE_SIZE)
+
+static struct kvm_vm *vm;
+
+#define XEN_HYPERCALL_MSR      0x40000000
+
+struct pvclock_vcpu_time_info {
+        u32   version;
+        u32   pad0;
+        u64   tsc_timestamp;
+        u64   system_time;
+        u32   tsc_to_system_mul;
+        s8    tsc_shift;
+        u8    flags;
+        u8    pad[2];
+} __attribute__((__packed__)); /* 32 bytes */
+
+struct pvclock_wall_clock {
+        u32   version;
+        u32   sec;
+        u32   nsec;
+} __attribute__((__packed__));
+
+static void guest_code(void)
+{
+       GUEST_DONE();
+}
+
+static int cmp_timespec(struct timespec *a, struct timespec *b)
+{
+       if (a->tv_sec > b->tv_sec)
+               return 1;
+       else if (a->tv_sec < b->tv_sec)
+               return -1;
+       else if (a->tv_nsec > b->tv_nsec)
+               return 1;
+       else if (a->tv_nsec < b->tv_nsec)
+               return -1;
+       else
+               return 0;
+}
+
+int main(int argc, char *argv[])
+{
+       struct timespec min_ts, max_ts, vm_ts;
+
+       if (!(kvm_check_cap(KVM_CAP_XEN_HVM) &
+             KVM_XEN_HVM_CONFIG_SHARED_INFO)) {
+               print_skip("KVM_XEN_HVM_CONFIG_SHARED_INFO not available");
+               exit(KSFT_SKIP);
+       }
+
+       clock_gettime(CLOCK_REALTIME, &min_ts);
+
+       vm = vm_create_default(VCPU_ID, 0, (void *) guest_code);
+       vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+       /* Map a region for the shared_info page */
+       vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+                                   SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 2, 0);
+
+       struct kvm_xen_hvm_config hvmc = {
+               .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
+               .msr = XEN_HYPERCALL_MSR,
+       };
+       vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc);
+
+       struct kvm_xen_hvm_attr lm = {
+               .type = KVM_XEN_ATTR_TYPE_LONG_MODE,
+               .u.long_mode = 1,
+       };
+       vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm);
+
+       struct kvm_xen_hvm_attr ha = {
+               .type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+               .u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE,
+       };
+       vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha);
+
+       struct kvm_xen_vcpu_attr vi = {
+               .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
+               .u.gpa = SHINFO_REGION_GPA + 0x40,
+       };
+       vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &vi);
+
+       struct kvm_xen_vcpu_attr pvclock = {
+               .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
+               .u.gpa = PVTIME_ADDR,
+       };
+       vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &pvclock);
+
+       for (;;) {
+               volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+               struct ucall uc;
+
+               vcpu_run(vm, VCPU_ID);
+
+               TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+                           "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+                           run->exit_reason,
+                           exit_reason_str(run->exit_reason));
+
+               switch (get_ucall(vm, VCPU_ID, &uc)) {
+               case UCALL_ABORT:
+                       TEST_FAIL("%s", (const char *)uc.args[0]);
+                       /* NOT REACHED */
+               case UCALL_SYNC:
+                       break;
+               case UCALL_DONE:
+                       goto done;
+               default:
+                       TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
+               }
+       }
+
+ done:
+       clock_gettime(CLOCK_REALTIME, &max_ts);
+
+       /*
+        * Just a *really* basic check that things are being put in the
+        * right place. The actual calculations are much the same for
+        * Xen as they are for the KVM variants, so no need to check.
+        */
+       struct pvclock_wall_clock *wc;
+       struct pvclock_vcpu_time_info *ti, *ti2;
+
+       wc = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0xc00);
+       ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20);
+       ti2 = addr_gpa2hva(vm, PVTIME_ADDR);
+
+       vm_ts.tv_sec = wc->sec;
+       vm_ts.tv_nsec = wc->nsec;
+        TEST_ASSERT(wc->version && !(wc->version & 1),
+                   "Bad wallclock version %x", wc->version);
+       TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old");
+       TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new");
+
+       TEST_ASSERT(ti->version && !(ti->version & 1),
+                   "Bad time_info version %x", ti->version);
+       TEST_ASSERT(ti2->version && !(ti2->version & 1),
+                   "Bad time_info version %x", ti->version);
+
+       kvm_vm_free(vm);
+       return 0;
+}
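For reference, the offsets poked above follow from the 64-bit Xen shared_info layout as the test assumes it; the vcpu_info placement at +0x40 is simply an arbitrary 64-byte-aligned slot picked by the test (this map is a reading aid, not taken from a Xen header in this patch):

/*
 *   SHINFO_REGION_GPA + 0x40   vcpu_info written by KVM          ('vi')
 *   SHINFO_REGION_GPA + 0x60   vcpu_info.time, the pvclock data
 *                              at offset 0x20 inside vcpu_info   ('ti')
 *   SHINFO_REGION_GPA + 0xc00  shared_info wall clock            ('wc')
 *   PVTIME_ADDR                separately registered time info   ('ti2')
 */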
diff --git a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c
new file mode 100644 (file)
index 0000000..8389e0b
--- /dev/null
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * xen_vmcall_test
+ *
+ * Copyright © 2020 Amazon.com, Inc. or its affiliates.
+ *
+ * Userspace hypercall testing
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID                5
+
+#define HCALL_REGION_GPA       0xc0000000ULL
+#define HCALL_REGION_SLOT      10
+#define PAGE_SIZE              4096
+
+static struct kvm_vm *vm;
+
+#define INPUTVALUE 17
+#define ARGVALUE(x) (0xdeadbeef5a5a0000UL + x)
+#define RETVALUE 0xcafef00dfbfbffffUL
+
+#define XEN_HYPERCALL_MSR      0x40000200
+#define HV_GUEST_OS_ID_MSR     0x40000000
+#define HV_HYPERCALL_MSR       0x40000001
+
+#define HVCALL_SIGNAL_EVENT            0x005d
+#define HV_STATUS_INVALID_ALIGNMENT    4
+
+static void guest_code(void)
+{
+       unsigned long rax = INPUTVALUE;
+       unsigned long rdi = ARGVALUE(1);
+       unsigned long rsi = ARGVALUE(2);
+       unsigned long rdx = ARGVALUE(3);
+       unsigned long rcx;
+       register unsigned long r10 __asm__("r10") = ARGVALUE(4);
+       register unsigned long r8 __asm__("r8") = ARGVALUE(5);
+       register unsigned long r9 __asm__("r9") = ARGVALUE(6);
+
+       /* First a direct invocation of 'vmcall' */
+       __asm__ __volatile__("vmcall" :
+                            "=a"(rax) :
+                            "a"(rax), "D"(rdi), "S"(rsi), "d"(rdx),
+                            "r"(r10), "r"(r8), "r"(r9));
+       GUEST_ASSERT(rax == RETVALUE);
+
+       /* Fill in the Xen hypercall page */
+       __asm__ __volatile__("wrmsr" : : "c" (XEN_HYPERCALL_MSR),
+                            "a" (HCALL_REGION_GPA & 0xffffffff),
+                            "d" (HCALL_REGION_GPA >> 32));
+
+       /* Set Hyper-V Guest OS ID */
+       __asm__ __volatile__("wrmsr" : : "c" (HV_GUEST_OS_ID_MSR),
+                            "a" (0x5a), "d" (0));
+
+       /* Hyper-V hypercall page */
+       u64 msrval = HCALL_REGION_GPA + PAGE_SIZE + 1;
+       __asm__ __volatile__("wrmsr" : : "c" (HV_HYPERCALL_MSR),
+                            "a" (msrval & 0xffffffff),
+                            "d" (msrval >> 32));
+
+       /* Invoke a Xen hypercall */
+       __asm__ __volatile__("call *%1" : "=a"(rax) :
+                            "r"(HCALL_REGION_GPA + INPUTVALUE * 32),
+                            "a"(rax), "D"(rdi), "S"(rsi), "d"(rdx),
+                            "r"(r10), "r"(r8), "r"(r9));
+       GUEST_ASSERT(rax == RETVALUE);
+
+       /* Invoke a Hyper-V hypercall */
+       rax = 0;
+       rcx = HVCALL_SIGNAL_EVENT;      /* code */
+       rdx = 0x5a5a5a5a;               /* ingpa (badly aligned) */
+       __asm__ __volatile__("call *%1" : "=a"(rax) :
+                            "r"(HCALL_REGION_GPA + PAGE_SIZE),
+                            "a"(rax), "c"(rcx), "d"(rdx),
+                            "r"(r8));
+       GUEST_ASSERT(rax == HV_STATUS_INVALID_ALIGNMENT);
+
+       GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+       if (!(kvm_check_cap(KVM_CAP_XEN_HVM) &
+             KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)) {
+               print_skip("KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL not available");
+               exit(KSFT_SKIP);
+       }
+
+       vm = vm_create_default(VCPU_ID, 0, (void *) guest_code);
+       vcpu_set_hv_cpuid(vm, VCPU_ID);
+
+       struct kvm_xen_hvm_config hvmc = {
+               .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
+               .msr = XEN_HYPERCALL_MSR,
+       };
+       vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc);
+
+       /* Map a region for the hypercall pages */
+       vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+                                   HCALL_REGION_GPA, HCALL_REGION_SLOT, 2, 0);
+       virt_map(vm, HCALL_REGION_GPA, HCALL_REGION_GPA, 2, 0);
+
+       for (;;) {
+               volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+               struct ucall uc;
+
+               vcpu_run(vm, VCPU_ID);
+
+               if (run->exit_reason == KVM_EXIT_XEN) {
+                       ASSERT_EQ(run->xen.type, KVM_EXIT_XEN_HCALL);
+                       ASSERT_EQ(run->xen.u.hcall.cpl, 0);
+                       ASSERT_EQ(run->xen.u.hcall.longmode, 1);
+                       ASSERT_EQ(run->xen.u.hcall.input, INPUTVALUE);
+                       ASSERT_EQ(run->xen.u.hcall.params[0], ARGVALUE(1));
+                       ASSERT_EQ(run->xen.u.hcall.params[1], ARGVALUE(2));
+                       ASSERT_EQ(run->xen.u.hcall.params[2], ARGVALUE(3));
+                       ASSERT_EQ(run->xen.u.hcall.params[3], ARGVALUE(4));
+                       ASSERT_EQ(run->xen.u.hcall.params[4], ARGVALUE(5));
+                       ASSERT_EQ(run->xen.u.hcall.params[5], ARGVALUE(6));
+                       run->xen.u.hcall.result = RETVALUE;
+                       continue;
+               }
+
+               TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+                           "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+                           run->exit_reason,
+                           exit_reason_str(run->exit_reason));
+
+               switch (get_ucall(vm, VCPU_ID, &uc)) {
+               case UCALL_ABORT:
+                       TEST_FAIL("%s", (const char *)uc.args[0]);
+                       /* NOT REACHED */
+               case UCALL_SYNC:
+                       break;
+               case UCALL_DONE:
+                       goto done;
+               default:
+                       TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
+               }
+       }
+done:
+       kvm_vm_free(vm);
+       return 0;
+}
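
For reference, the indirection through HCALL_REGION_GPA + INPUTVALUE * 32 in guest_code() relies on the Xen hypercall page layout: each hypercall number owns a 32-byte stub, so the entry point is a fixed offset from the page base. The Hyper-V hypercall page, by contrast, has a single entry point, which is why the second call above simply targets HCALL_REGION_GPA + PAGE_SIZE. A minimal sketch of the Xen offset calculation (the helper name and macro are illustrative, not part of the test):

/* Each Xen hypercall occupies a 32-byte stub in the hypercall page. */
#define XEN_HYPERCALL_STUB_SIZE	32

static inline void *xen_hypercall_entry(void *hypercall_page, unsigned int nr)
{
	return (char *)hypercall_page + nr * XEN_HYPERCALL_STUB_SIZE;
}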
index 9d01299..7aafefc 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/vmalloc.h>
 #include <linux/kvm_dirty_ring.h>
 #include <trace/events/kvm.h>
+#include "mmu_lock.h"
 
 int __weak kvm_cpu_dirty_log_size(void)
 {
@@ -60,17 +61,16 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
        if (!memslot || (offset + __fls(mask)) >= memslot->npages)
                return;
 
-       spin_lock(&kvm->mmu_lock);
+       KVM_MMU_LOCK(kvm);
        kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask);
-       spin_unlock(&kvm->mmu_lock);
+       KVM_MMU_UNLOCK(kvm);
 }
 
 int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size)
 {
-       ring->dirty_gfns = vmalloc(size);
+       ring->dirty_gfns = vzalloc(size);
        if (!ring->dirty_gfns)
                return -ENOMEM;
-       memset(ring->dirty_gfns, 0, size);
 
        ring->size = size / sizeof(struct kvm_dirty_gfn);
        ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries();
index 8367d88..001b9de 100644 (file)
@@ -58,6 +58,7 @@
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
+#include "mmu_lock.h"
 #include "vfio.h"
 
 #define CREATE_TRACE_POINTS
@@ -459,13 +460,15 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
        int idx;
 
        idx = srcu_read_lock(&kvm->srcu);
-       spin_lock(&kvm->mmu_lock);
+
+       KVM_MMU_LOCK(kvm);
+
        kvm->mmu_notifier_seq++;
 
        if (kvm_set_spte_hva(kvm, address, pte))
                kvm_flush_remote_tlbs(kvm);
 
-       spin_unlock(&kvm->mmu_lock);
+       KVM_MMU_UNLOCK(kvm);
        srcu_read_unlock(&kvm->srcu, idx);
 }
 
@@ -476,7 +479,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
        int need_tlb_flush = 0, idx;
 
        idx = srcu_read_lock(&kvm->srcu);
-       spin_lock(&kvm->mmu_lock);
+       KVM_MMU_LOCK(kvm);
        /*
         * The count increase must become visible at unlock time as no
         * spte can be established without taking the mmu_lock and
@@ -489,7 +492,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
        if (need_tlb_flush || kvm->tlbs_dirty)
                kvm_flush_remote_tlbs(kvm);
 
-       spin_unlock(&kvm->mmu_lock);
+       KVM_MMU_UNLOCK(kvm);
        srcu_read_unlock(&kvm->srcu, idx);
 
        return 0;
@@ -500,7 +503,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 {
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 
-       spin_lock(&kvm->mmu_lock);
+       KVM_MMU_LOCK(kvm);
        /*
         * This sequence increase will notify the kvm page fault that
         * the page that is going to be mapped in the spte could have
@@ -514,7 +517,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
         * in conjunction with the smp_rmb in mmu_notifier_retry().
         */
        kvm->mmu_notifier_count--;
-       spin_unlock(&kvm->mmu_lock);
+       KVM_MMU_UNLOCK(kvm);
 
        BUG_ON(kvm->mmu_notifier_count < 0);
 }
@@ -528,13 +531,13 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
        int young, idx;
 
        idx = srcu_read_lock(&kvm->srcu);
-       spin_lock(&kvm->mmu_lock);
+       KVM_MMU_LOCK(kvm);
 
        young = kvm_age_hva(kvm, start, end);
        if (young)
                kvm_flush_remote_tlbs(kvm);
 
-       spin_unlock(&kvm->mmu_lock);
+       KVM_MMU_UNLOCK(kvm);
        srcu_read_unlock(&kvm->srcu, idx);
 
        return young;
@@ -549,7 +552,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
        int young, idx;
 
        idx = srcu_read_lock(&kvm->srcu);
-       spin_lock(&kvm->mmu_lock);
+       KVM_MMU_LOCK(kvm);
        /*
         * Even though we do not flush TLB, this will still adversely
         * affect performance on pre-Haswell Intel EPT, where there is
@@ -564,7 +567,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
         * more sophisticated heuristic later.
         */
        young = kvm_age_hva(kvm, start, end);
-       spin_unlock(&kvm->mmu_lock);
+       KVM_MMU_UNLOCK(kvm);
        srcu_read_unlock(&kvm->srcu, idx);
 
        return young;
@@ -578,9 +581,9 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
        int young, idx;
 
        idx = srcu_read_lock(&kvm->srcu);
-       spin_lock(&kvm->mmu_lock);
+       KVM_MMU_LOCK(kvm);
        young = kvm_test_age_hva(kvm, address);
-       spin_unlock(&kvm->mmu_lock);
+       KVM_MMU_UNLOCK(kvm);
        srcu_read_unlock(&kvm->srcu, idx);
 
        return young;
@@ -745,7 +748,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
        if (!kvm)
                return ERR_PTR(-ENOMEM);
 
-       spin_lock_init(&kvm->mmu_lock);
+       KVM_MMU_LOCK_INIT(kvm);
        mmgrab(current->mm);
        kvm->mm = current->mm;
        kvm_eventfd_init(kvm);
@@ -1525,7 +1528,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
                dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
                memset(dirty_bitmap_buffer, 0, n);
 
-               spin_lock(&kvm->mmu_lock);
+               KVM_MMU_LOCK(kvm);
                for (i = 0; i < n / sizeof(long); i++) {
                        unsigned long mask;
                        gfn_t offset;
@@ -1541,7 +1544,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
                        kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
                                                                offset, mask);
                }
-               spin_unlock(&kvm->mmu_lock);
+               KVM_MMU_UNLOCK(kvm);
        }
 
        if (flush)
@@ -1636,7 +1639,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm,
        if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
                return -EFAULT;
 
-       spin_lock(&kvm->mmu_lock);
+       KVM_MMU_LOCK(kvm);
        for (offset = log->first_page, i = offset / BITS_PER_LONG,
                 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
             i++, offset += BITS_PER_LONG) {
@@ -1659,7 +1662,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm,
                                                                offset, mask);
                }
        }
-       spin_unlock(&kvm->mmu_lock);
+       KVM_MMU_UNLOCK(kvm);
 
        if (flush)
                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -1903,10 +1906,12 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
                               bool write_fault, bool *writable,
                               kvm_pfn_t *p_pfn)
 {
-       unsigned long pfn;
+       kvm_pfn_t pfn;
+       pte_t *ptep;
+       spinlock_t *ptl;
        int r;
 
-       r = follow_pfn(vma, addr, &pfn);
+       r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
        if (r) {
                /*
                 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
@@ -1921,14 +1926,19 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
                if (r)
                        return r;
 
-               r = follow_pfn(vma, addr, &pfn);
+               r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
                if (r)
                        return r;
+       }
 
+       if (write_fault && !pte_write(*ptep)) {
+               pfn = KVM_PFN_ERR_RO_FAULT;
+               goto out;
        }
 
        if (writable)
-               *writable = true;
+               *writable = pte_write(*ptep);
+       pfn = pte_pfn(*ptep);
 
        /*
         * Get a reference here because callers of *hva_to_pfn* and
@@ -1943,6 +1953,8 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
         */
        kvm_get_pfn(pfn);
 
+out:
+       pte_unmap_unlock(ptep, ptl);
        *p_pfn = pfn;
        return 0;
 }
diff --git a/virt/kvm/mmu_lock.h b/virt/kvm/mmu_lock.h
new file mode 100644 (file)
index 0000000..9e1308f
--- /dev/null
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#ifndef KVM_MMU_LOCK_H
+#define KVM_MMU_LOCK_H 1
+
+/*
+ * Architectures can choose whether to use an rwlock or spinlock
+ * for the mmu_lock.  These macros, for use in common code
+ * only, avoid using #ifdefs in places that must deal with
+ * multiple architectures.
+ */
+
+#ifdef KVM_HAVE_MMU_RWLOCK
+#define KVM_MMU_LOCK_INIT(kvm) rwlock_init(&(kvm)->mmu_lock)
+#define KVM_MMU_LOCK(kvm)      write_lock(&(kvm)->mmu_lock)
+#define KVM_MMU_UNLOCK(kvm)    write_unlock(&(kvm)->mmu_lock)
+#else
+#define KVM_MMU_LOCK_INIT(kvm) spin_lock_init(&(kvm)->mmu_lock)
+#define KVM_MMU_LOCK(kvm)      spin_lock(&(kvm)->mmu_lock)
+#define KVM_MMU_UNLOCK(kvm)    spin_unlock(&(kvm)->mmu_lock)
+#endif /* KVM_HAVE_MMU_RWLOCK */
+
+#endif
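
As a usage sketch (not taken from this series): common code takes the lock only through these wrappers and never names the lock type, while an architecture that wants reader/writer semantics opts in by defining KVM_HAVE_MMU_RWLOCK before this header is included, typically from its asm/kvm_host.h. The function below is illustrative only.

/* Illustrative only: common code is expected to use the wrappers like this. */
static void example_update_under_mmu_lock(struct kvm *kvm)
{
	KVM_MMU_LOCK(kvm);	/* write_lock() or spin_lock(), depending on the arch */
	/* ... walk or modify the shadow/TDP page tables here ... */
	KVM_MMU_UNLOCK(kvm);
}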